[ZARCH] Z14 support, BLAS 1/2 single precision implementations, Some missing double precision implementations, Gemv optimization
This commit is contained in:
parent
ee955757f9
commit
23229011db
|
|
@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
|
|||
FCOMMON_OPT += -march=z13 -mzvector
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z14)
|
||||
CCOMMON_OPT += -march=z14 -mzvector
|
||||
FCOMMON_OPT += -march=z14 -mzvector
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -29,40 +29,25 @@
|
|||
|
||||
/* Numeric CPU identifiers; each value is also the index into the name tables below. */
#define CPU_GENERIC     0
#define CPU_Z13         1
#define CPU_Z14         2

/*
 * Upper-case core names, indexed by the CPU_* constants.
 * Every entry needs its own trailing comma: two adjacent string literals
 * are concatenated by the compiler, which would shift all later entries.
 */
static char *cpuname[] = {
  "ZARCH_GENERIC",
  "Z13",
  "Z14"
};

/* Lower-case core names, indexed by the CPU_* constants. */
static char *cpuname_lower[] = {
  "zarch_generic",
  "z13",
  "z14"
};
|
||||
|
||||
int detect(void)
|
||||
{
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
infile = fopen("/proc/sysinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("Type", buffer, 4)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
|
||||
return CPU_GENERIC;
|
||||
// return CPU_GENERIC;
|
||||
return CPU_Z14;
|
||||
|
||||
}
|
||||
|
||||
void get_libname(void)
|
||||
|
|
@ -107,5 +92,9 @@ void get_cpuconfig(void)
|
|||
printf("#define Z13\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
case CPU_Z14:
|
||||
printf("#define Z14\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,18 +1,18 @@
|
|||
SAMAXKERNEL = ../arm/amax.c
|
||||
DAMAXKERNEL = ../arm/amax.c
|
||||
DAMAXKERNEL = damax.c
|
||||
CAMAXKERNEL = ../arm/zamax.c
|
||||
ZAMAXKERNEL = ../arm/zamax.c
|
||||
ZAMAXKERNEL = zamax.c
|
||||
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = damin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = dmax.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = dmin.c
|
||||
|
||||
ISAMAXKERNEL = ../arm/iamax.c
|
||||
IDAMAXKERNEL = idamax.c
|
||||
|
|
@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c
|
|||
IZAMINKERNEL = izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = idmax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = idmin.c
|
||||
|
||||
SASUMKERNEL = ../arm/asum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
|
|
@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c
|
|||
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVNKERNEL = dgemv_n_4.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
DGEMVTKERNEL = dgemv_t_4.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
|
||||
STRMMKERNEL = strmm8x4V.S
|
||||
DTRMMKERNEL = trmm8x4V.S
|
||||
|
|
|
|||
|
|
@ -0,0 +1,146 @@
|
|||
SAMAXKERNEL = samax.c
|
||||
DAMAXKERNEL = damax.c
|
||||
CAMAXKERNEL = camax.c
|
||||
ZAMAXKERNEL = zamax.c
|
||||
|
||||
SAMINKERNEL = samin.c
|
||||
DAMINKERNEL = damin.c
|
||||
CAMINKERNEL = camin.c
|
||||
ZAMINKERNEL = zamin.c
|
||||
|
||||
SMAXKERNEL = smax.c
|
||||
DMAXKERNEL = dmax.c
|
||||
|
||||
SMINKERNEL = smin.c
|
||||
DMINKERNEL = dmin.c
|
||||
|
||||
ISAMAXKERNEL = isamax.c
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ICAMAXKERNEL = icamax.c
|
||||
IZAMAXKERNEL = izamax.c
|
||||
|
||||
ISAMINKERNEL = isamin.c
|
||||
IDAMINKERNEL = idamin.c
|
||||
ICAMINKERNEL = icamin.c
|
||||
IZAMINKERNEL = izamin.c
|
||||
|
||||
ISMAXKERNEL = ismax.c
|
||||
IDMAXKERNEL = idmax.c
|
||||
|
||||
ISMINKERNEL = ismin.c
|
||||
IDMINKERNEL = idmin.c
|
||||
|
||||
SASUMKERNEL = sasum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
|
||||
SCOPYKERNEL = scopy.c
|
||||
DCOPYKERNEL = dcopy.c
|
||||
CCOPYKERNEL = ccopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
CDOTKERNEL = cdot.c
|
||||
ZDOTKERNEL = zdot.c
|
||||
DSDOTKERNEL = dsdot.c
|
||||
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
CROTKERNEL = crot.c
|
||||
ZROTKERNEL = zrot.c
|
||||
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
CSCALKERNEL = cscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
CSWAPKERNEL = cswap.c
|
||||
ZSWAPKERNEL = zswap.c
|
||||
|
||||
SGEMVNKERNEL = sgemv_n_4.c
|
||||
DGEMVNKERNEL = dgemv_n_4.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
|
||||
SGEMVTKERNEL = sgemv_t_4.c
|
||||
DGEMVTKERNEL = dgemv_t_4.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
|
||||
STRMMKERNEL = strmm8x4V.S
|
||||
DTRMMKERNEL = trmm8x4V.S
|
||||
CTRMMKERNEL = ctrmm4x4V.S
|
||||
ZTRMMKERNEL = ztrmm4x4V.S
|
||||
|
||||
# Copy-routine object names below use $(TSUFFIX).$(SUFFIX), the same form
# as the ZGEMM entries in this file, instead of a hard-coded ".o".
SGEMMKERNEL    =  strmm8x4V.S
SGEMMINCOPY    =  ../generic/gemm_ncopy_8.c
SGEMMITCOPY    =  ../generic/gemm_tcopy_8.c
SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DGEMMKERNEL    =  gemm8x4V.S
DGEMMINCOPY    =  ../generic/gemm_ncopy_8.c
DGEMMITCOPY    =  ../generic/gemm_tcopy_8.c
DGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
DGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMMKERNEL    =  ctrmm4x4V.S
CGEMMONCOPY    =  ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY    =  ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = ztrmm4x4V.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,269 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * Vector kernel: largest |re| + |im| over the first n complex entries of x
 * (unit stride, interleaved single-precision re/im pairs).
 *
 * The loop count is n >> 5 and each pass consumes 256 bytes (32 complex
 * values), so the caller must pass a positive multiple of 32; a count of
 * zero would make the brctg loop wrap.
 *
 * v0 holds four running maxima, seeded from the first four entries, and is
 * reduced to a single value after the loop.
 */
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amax;

    __asm__ volatile (
        /* Seed: real parts of entries 0..3 -> v0, imaginary parts -> v16. */
        "vlef   %%v0,0(%2),0              \n\t"
        "vlef   %%v16,4(%2),0             \n\t"
        "vlef   %%v0,8(%2),1              \n\t"
        "vlef   %%v16,12(%2),1            \n\t"
        "vlef   %%v0,16(%2),2             \n\t"
        "vlef   %%v16,20(%2),2            \n\t"
        "vlef   %%v0,24(%2),3             \n\t"
        "vlef   %%v16,28(%2),3            \n\t"
        "vflpsb %%v0,%%v0                 \n\t"
        "vflpsb %%v16,%%v16               \n\t"
        "vfasb  %%v0,%%v0,%%v16           \n\t"
        "srlg   %%r0,%1,5                 \n\t"   /* r0 = n / 32 iterations */
        "xgr    %%r1,%%r1                 \n\t"   /* r1 = running byte offset */
        "0:                               \n\t"
        /* Prefetch relative to the running offset so the hint advances. */
        "pfd    1, 1024(%%r1,%2)          \n\t"

        /* First 16 entries: even registers take re, odd registers take im. */
        "vlef   %%v16,0(%%r1,%2),0        \n\t"
        "vlef   %%v17,4(%%r1,%2),0        \n\t"
        "vlef   %%v16,8(%%r1,%2),1        \n\t"
        "vlef   %%v17,12(%%r1,%2),1       \n\t"
        "vlef   %%v16,16(%%r1,%2),2       \n\t"
        "vlef   %%v17,20(%%r1,%2),2       \n\t"
        "vlef   %%v16,24(%%r1,%2),3       \n\t"
        "vlef   %%v17,28(%%r1,%2),3       \n\t"

        "vlef   %%v18,32(%%r1,%2),0       \n\t"
        "vlef   %%v19,36(%%r1,%2),0       \n\t"
        "vlef   %%v18,40(%%r1,%2),1       \n\t"
        "vlef   %%v19,44(%%r1,%2),1       \n\t"
        "vlef   %%v18,48(%%r1,%2),2       \n\t"
        "vlef   %%v19,52(%%r1,%2),2       \n\t"
        "vlef   %%v18,56(%%r1,%2),3       \n\t"
        "vlef   %%v19,60(%%r1,%2),3       \n\t"   /* imaginary part of the 8th entry */

        "vlef   %%v20,64(%%r1,%2),0       \n\t"
        "vlef   %%v21,68(%%r1,%2),0       \n\t"
        "vlef   %%v20,72(%%r1,%2),1       \n\t"
        "vlef   %%v21,76(%%r1,%2),1       \n\t"
        "vlef   %%v20,80(%%r1,%2),2       \n\t"
        "vlef   %%v21,84(%%r1,%2),2       \n\t"
        "vlef   %%v20,88(%%r1,%2),3       \n\t"
        "vlef   %%v21,92(%%r1,%2),3       \n\t"

        "vlef   %%v22,96(%%r1,%2),0       \n\t"
        "vlef   %%v23,100(%%r1,%2),0      \n\t"
        "vlef   %%v22,104(%%r1,%2),1      \n\t"
        "vlef   %%v23,108(%%r1,%2),1      \n\t"
        "vlef   %%v22,112(%%r1,%2),2      \n\t"
        "vlef   %%v23,116(%%r1,%2),2      \n\t"
        "vlef   %%v22,120(%%r1,%2),3      \n\t"
        "vlef   %%v23,124(%%r1,%2),3      \n\t"

        /* |re| + |im| for the 16 entries, packed into v16..v19. */
        "vflpsb %%v16, %%v16              \n\t"
        "vflpsb %%v17, %%v17              \n\t"
        "vflpsb %%v18, %%v18              \n\t"
        "vflpsb %%v19, %%v19              \n\t"
        "vflpsb %%v20, %%v20              \n\t"
        "vflpsb %%v21, %%v21              \n\t"
        "vflpsb %%v22, %%v22              \n\t"
        "vflpsb %%v23, %%v23              \n\t"
        "vfasb  %%v16,%%v16,%%v17         \n\t"
        "vfasb  %%v17,%%v18,%%v19         \n\t"
        "vfasb  %%v18,%%v20,%%v21         \n\t"
        "vfasb  %%v19,%%v22,%%v23         \n\t"

        /* Pairwise maxima (compare-high mask + select), then fold into v0. */
        "vfchsb %%v24,%%v16,%%v17         \n\t"
        "vfchsb %%v25,%%v18,%%v19         \n\t"
        "vsel   %%v24,%%v16,%%v17,%%v24   \n\t"
        "vsel   %%v25,%%v18,%%v19,%%v25   \n\t"

        "vfchsb %%v26,%%v24,%%v25         \n\t"
        "vsel   %%v26,%%v24,%%v25,%%v26   \n\t"

        "vfchsb %%v27,%%v26,%%v0          \n\t"
        "vsel   %%v0,%%v26,%%v0,%%v27     \n\t"

        /* Second 16 entries, same scheme at +128 bytes. */
        "vlef   %%v16,128(%%r1,%2),0      \n\t"
        "vlef   %%v17,132(%%r1,%2),0      \n\t"
        "vlef   %%v16,136(%%r1,%2),1      \n\t"
        "vlef   %%v17,140(%%r1,%2),1      \n\t"
        "vlef   %%v16,144(%%r1,%2),2      \n\t"
        "vlef   %%v17,148(%%r1,%2),2      \n\t"
        "vlef   %%v16,152(%%r1,%2),3      \n\t"
        "vlef   %%v17,156(%%r1,%2),3      \n\t"

        "vlef   %%v18,160(%%r1,%2),0      \n\t"
        "vlef   %%v19,164(%%r1,%2),0      \n\t"
        "vlef   %%v18,168(%%r1,%2),1      \n\t"
        "vlef   %%v19,172(%%r1,%2),1      \n\t"
        "vlef   %%v18,176(%%r1,%2),2      \n\t"
        "vlef   %%v19,180(%%r1,%2),2      \n\t"
        "vlef   %%v18,184(%%r1,%2),3      \n\t"
        "vlef   %%v19,188(%%r1,%2),3      \n\t"

        "vlef   %%v20,192(%%r1,%2),0      \n\t"
        "vlef   %%v21,196(%%r1,%2),0      \n\t"
        "vlef   %%v20,200(%%r1,%2),1      \n\t"
        "vlef   %%v21,204(%%r1,%2),1      \n\t"
        "vlef   %%v20,208(%%r1,%2),2      \n\t"
        "vlef   %%v21,212(%%r1,%2),2      \n\t"
        "vlef   %%v20,216(%%r1,%2),3      \n\t"
        "vlef   %%v21,220(%%r1,%2),3      \n\t"

        "vlef   %%v22,224(%%r1,%2),0      \n\t"
        "vlef   %%v23,228(%%r1,%2),0      \n\t"
        "vlef   %%v22,232(%%r1,%2),1      \n\t"
        "vlef   %%v23,236(%%r1,%2),1      \n\t"
        "vlef   %%v22,240(%%r1,%2),2      \n\t"
        "vlef   %%v23,244(%%r1,%2),2      \n\t"
        "vlef   %%v22,248(%%r1,%2),3      \n\t"
        "vlef   %%v23,252(%%r1,%2),3      \n\t"

        "vflpsb %%v16, %%v16              \n\t"
        "vflpsb %%v17, %%v17              \n\t"
        "vflpsb %%v18, %%v18              \n\t"
        "vflpsb %%v19, %%v19              \n\t"
        "vflpsb %%v20, %%v20              \n\t"
        "vflpsb %%v21, %%v21              \n\t"
        "vflpsb %%v22, %%v22              \n\t"
        "vflpsb %%v23, %%v23              \n\t"
        "vfasb  %%v16,%%v16,%%v17         \n\t"
        "vfasb  %%v17,%%v18,%%v19         \n\t"
        "vfasb  %%v18,%%v20,%%v21         \n\t"
        "vfasb  %%v19,%%v22,%%v23         \n\t"

        "vfchsb %%v24,%%v16,%%v17         \n\t"
        "vfchsb %%v25,%%v18,%%v19         \n\t"
        "vsel   %%v24,%%v16,%%v17,%%v24   \n\t"
        "vsel   %%v25,%%v18,%%v19,%%v25   \n\t"

        "vfchsb %%v26,%%v24,%%v25         \n\t"
        "vsel   %%v26,%%v24,%%v25,%%v26   \n\t"

        "vfchsb %%v27,%%v26,%%v0          \n\t"
        "vsel   %%v0,%%v26,%%v0,%%v27     \n\t"

        "agfi   %%r1, 256                 \n\t"
        "brctg  %%r0, 0b                  \n\t"

        /* Horizontal reduction of the four lanes of v0 into lane 0. */
        "veslg  %%v16,%%v0,32             \n\t"
        "vfchsb %%v17,%%v16,%%v0          \n\t"
        "vsel   %%v0,%%v16,%%v0,%%v17     \n\t"

        "vrepf  %%v16,%%v0,2              \n\t"
        "wfchsb %%v17,%%v16,%%v0          \n\t"
        "vsel   %%v0,%%v16,%%v0,%%v17     \n\t"
        "ler    %0,%%f0                       "
        :"=f"(amax)
        /* n complex entries span 2*n FLOATs (same extent form as casum). */
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return amax;
}
|
||||
|
||||
/*
 * Returns the maximum of |re| + |im| over n complex entries of x.
 *
 * n      number of complex entries
 * x      interleaved re/im values
 * inc_x  stride between entries, counted in complex elements
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT maxf = 0.0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return (maxf);

    if (inc_x == 1) {

        /* The vector kernel handles the leading multiple of 32 entries. */
        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            maxf = camax_kernel_32(n1, x);
            i = n1;
        } else {
            maxf = CABS1(x,0);
            i++;
        }

        /* Scalar tail: compare the full |re| + |im| of each entry, using
           the same measure as the kernel and the strided path. */
        while (i < n) {
            if (CABS1(x,i*2) > maxf) {
                maxf = CABS1(x,i*2);
            }
            i++;
        }
        return (maxf);

    } else {

        inc_x2 = 2 * inc_x;   /* stride in FLOATs */
        maxf = CABS1(x,0);
        i += inc_x2;
        j++;

        /* Entry 0 is already consumed; unroll by four while four more fit. */
        BLASLONG n1 = (n - 1) & -4;
        while (j < n1) {

            if (CABS1(x,i) > maxf) {
                maxf = CABS1(x,i);
            }
            if (CABS1(x,i+inc_x2) > maxf) {
                maxf = CABS1(x,i+inc_x2);
            }
            if (CABS1(x,i+inc_x2*2) > maxf) {
                maxf = CABS1(x,i+inc_x2*2);
            }
            if (CABS1(x,i+inc_x2*3) > maxf) {
                maxf = CABS1(x,i+inc_x2*3);
            }

            i += inc_x2 * 4;
            j += 4;
        }

        while (j < n) {
            if (CABS1(x,i) > maxf) {
                maxf = CABS1(x,i);
            }
            i += inc_x2;
            j++;
        }
        return (maxf);
    }
}
|
||||
|
|
@ -0,0 +1,269 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * Vector kernel: smallest |re| + |im| over the first n complex entries of x
 * (unit stride, interleaved single-precision re/im pairs).
 *
 * The loop count is n >> 5 and each pass consumes 256 bytes (32 complex
 * values), so the caller must pass a positive multiple of 32; a count of
 * zero would make the brctg loop wrap.
 *
 * v0 holds four running minima, seeded from the first four entries, and is
 * reduced to a single value after the loop.
 */
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amin;

    __asm__ volatile (
        /* Seed: real parts of entries 0..3 -> v0, imaginary parts -> v16.
           Each entry goes to its own lane (0,1,2,3). */
        "vlef   %%v0,0(%2),0              \n\t"
        "vlef   %%v16,4(%2),0             \n\t"
        "vlef   %%v0,8(%2),1              \n\t"
        "vlef   %%v16,12(%2),1            \n\t"
        "vlef   %%v0,16(%2),2             \n\t"
        "vlef   %%v16,20(%2),2            \n\t"
        "vlef   %%v0,24(%2),3             \n\t"
        "vlef   %%v16,28(%2),3            \n\t"
        "vflpsb %%v0,%%v0                 \n\t"
        "vflpsb %%v16,%%v16               \n\t"
        "vfasb  %%v0,%%v0,%%v16           \n\t"
        "srlg   %%r0,%1,5                 \n\t"   /* r0 = n / 32 iterations */
        "xgr    %%r1,%%r1                 \n\t"   /* r1 = running byte offset */
        "0:                               \n\t"
        "pfd    1, 1024(%%r1,%2)          \n\t"

        /* First 16 entries: even registers take re, odd registers take im. */
        "vlef   %%v16,0(%%r1,%2),0        \n\t"
        "vlef   %%v17,4(%%r1,%2),0        \n\t"
        "vlef   %%v16,8(%%r1,%2),1        \n\t"
        "vlef   %%v17,12(%%r1,%2),1       \n\t"
        "vlef   %%v16,16(%%r1,%2),2       \n\t"
        "vlef   %%v17,20(%%r1,%2),2       \n\t"
        "vlef   %%v16,24(%%r1,%2),3       \n\t"
        "vlef   %%v17,28(%%r1,%2),3       \n\t"

        "vlef   %%v18,32(%%r1,%2),0       \n\t"
        "vlef   %%v19,36(%%r1,%2),0       \n\t"
        "vlef   %%v18,40(%%r1,%2),1       \n\t"
        "vlef   %%v19,44(%%r1,%2),1       \n\t"
        "vlef   %%v18,48(%%r1,%2),2       \n\t"
        "vlef   %%v19,52(%%r1,%2),2       \n\t"
        "vlef   %%v18,56(%%r1,%2),3       \n\t"
        "vlef   %%v19,60(%%r1,%2),3       \n\t"   /* imaginary part of the 8th entry */

        "vlef   %%v20,64(%%r1,%2),0       \n\t"
        "vlef   %%v21,68(%%r1,%2),0       \n\t"
        "vlef   %%v20,72(%%r1,%2),1       \n\t"
        "vlef   %%v21,76(%%r1,%2),1       \n\t"
        "vlef   %%v20,80(%%r1,%2),2       \n\t"
        "vlef   %%v21,84(%%r1,%2),2       \n\t"
        "vlef   %%v20,88(%%r1,%2),3       \n\t"
        "vlef   %%v21,92(%%r1,%2),3       \n\t"

        "vlef   %%v22,96(%%r1,%2),0       \n\t"
        "vlef   %%v23,100(%%r1,%2),0      \n\t"
        "vlef   %%v22,104(%%r1,%2),1      \n\t"
        "vlef   %%v23,108(%%r1,%2),1      \n\t"
        "vlef   %%v22,112(%%r1,%2),2      \n\t"
        "vlef   %%v23,116(%%r1,%2),2      \n\t"
        "vlef   %%v22,120(%%r1,%2),3      \n\t"
        "vlef   %%v23,124(%%r1,%2),3      \n\t"

        /* |re| + |im| for the 16 entries, packed into v16..v19. */
        "vflpsb %%v16, %%v16              \n\t"
        "vflpsb %%v17, %%v17              \n\t"
        "vflpsb %%v18, %%v18              \n\t"
        "vflpsb %%v19, %%v19              \n\t"
        "vflpsb %%v20, %%v20              \n\t"
        "vflpsb %%v21, %%v21              \n\t"
        "vflpsb %%v22, %%v22              \n\t"
        "vflpsb %%v23, %%v23              \n\t"
        "vfasb  %%v16,%%v16,%%v17         \n\t"
        "vfasb  %%v17,%%v18,%%v19         \n\t"
        "vfasb  %%v18,%%v20,%%v21         \n\t"
        "vfasb  %%v19,%%v22,%%v23         \n\t"

        /* Pairwise minima: the mask marks lanes where the second operand
           of the pair is larger, and vsel then keeps the first. */
        "vfchsb %%v24,%%v17,%%v16         \n\t"
        "vfchsb %%v25,%%v19,%%v18         \n\t"
        "vsel   %%v24,%%v16,%%v17,%%v24   \n\t"
        "vsel   %%v25,%%v18,%%v19,%%v25   \n\t"

        "vfchsb %%v26,%%v25,%%v24         \n\t"
        "vsel   %%v26,%%v24,%%v25,%%v26   \n\t"

        "vfchsb %%v27,%%v0,%%v26          \n\t"
        "vsel   %%v0,%%v26,%%v0,%%v27     \n\t"

        /* Second 16 entries, same scheme at +128 bytes. */
        "vlef   %%v16,128(%%r1,%2),0      \n\t"
        "vlef   %%v17,132(%%r1,%2),0      \n\t"
        "vlef   %%v16,136(%%r1,%2),1      \n\t"
        "vlef   %%v17,140(%%r1,%2),1      \n\t"
        "vlef   %%v16,144(%%r1,%2),2      \n\t"
        "vlef   %%v17,148(%%r1,%2),2      \n\t"
        "vlef   %%v16,152(%%r1,%2),3      \n\t"
        "vlef   %%v17,156(%%r1,%2),3      \n\t"

        "vlef   %%v18,160(%%r1,%2),0      \n\t"
        "vlef   %%v19,164(%%r1,%2),0      \n\t"
        "vlef   %%v18,168(%%r1,%2),1      \n\t"
        "vlef   %%v19,172(%%r1,%2),1      \n\t"
        "vlef   %%v18,176(%%r1,%2),2      \n\t"
        "vlef   %%v19,180(%%r1,%2),2      \n\t"
        "vlef   %%v18,184(%%r1,%2),3      \n\t"
        "vlef   %%v19,188(%%r1,%2),3      \n\t"

        "vlef   %%v20,192(%%r1,%2),0      \n\t"
        "vlef   %%v21,196(%%r1,%2),0      \n\t"
        "vlef   %%v20,200(%%r1,%2),1      \n\t"
        "vlef   %%v21,204(%%r1,%2),1      \n\t"
        "vlef   %%v20,208(%%r1,%2),2      \n\t"
        "vlef   %%v21,212(%%r1,%2),2      \n\t"
        "vlef   %%v20,216(%%r1,%2),3      \n\t"
        "vlef   %%v21,220(%%r1,%2),3      \n\t"

        "vlef   %%v22,224(%%r1,%2),0      \n\t"
        "vlef   %%v23,228(%%r1,%2),0      \n\t"
        "vlef   %%v22,232(%%r1,%2),1      \n\t"
        "vlef   %%v23,236(%%r1,%2),1      \n\t"
        "vlef   %%v22,240(%%r1,%2),2      \n\t"
        "vlef   %%v23,244(%%r1,%2),2      \n\t"
        "vlef   %%v22,248(%%r1,%2),3      \n\t"
        "vlef   %%v23,252(%%r1,%2),3      \n\t"

        "vflpsb %%v16, %%v16              \n\t"
        "vflpsb %%v17, %%v17              \n\t"
        "vflpsb %%v18, %%v18              \n\t"
        "vflpsb %%v19, %%v19              \n\t"
        "vflpsb %%v20, %%v20              \n\t"
        "vflpsb %%v21, %%v21              \n\t"
        "vflpsb %%v22, %%v22              \n\t"
        "vflpsb %%v23, %%v23              \n\t"
        "vfasb  %%v16,%%v16,%%v17         \n\t"
        "vfasb  %%v17,%%v18,%%v19         \n\t"
        "vfasb  %%v18,%%v20,%%v21         \n\t"
        "vfasb  %%v19,%%v22,%%v23         \n\t"

        "vfchsb %%v24,%%v17,%%v16         \n\t"
        "vfchsb %%v25,%%v19,%%v18         \n\t"
        "vsel   %%v24,%%v16,%%v17,%%v24   \n\t"
        "vsel   %%v25,%%v18,%%v19,%%v25   \n\t"

        "vfchsb %%v26,%%v25,%%v24         \n\t"
        "vsel   %%v26,%%v24,%%v25,%%v26   \n\t"

        "vfchsb %%v27,%%v0,%%v26          \n\t"
        "vsel   %%v0,%%v26,%%v0,%%v27     \n\t"

        "agfi   %%r1, 256                 \n\t"
        "brctg  %%r0, 0b                  \n\t"

        /* Horizontal reduction of the four lanes of v0 into lane 0. */
        "veslg  %%v16,%%v0,32             \n\t"
        "vfchsb %%v17,%%v0,%%v16          \n\t"
        "vsel   %%v0,%%v16,%%v0,%%v17     \n\t"

        "vrepf  %%v16,%%v0,2              \n\t"
        "wfchsb %%v17,%%v0,%%v16          \n\t"
        "vsel   %%v0,%%v16,%%v0,%%v17     \n\t"
        "ler    %0,%%f0                       "
        :"=f"(amin)
        /* n complex entries span 2*n FLOATs (same extent form as casum). */
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return amin;
}
|
||||
|
||||
/*
 * Returns the minimum of |re| + |im| over n complex entries of x.
 *
 * n      number of complex entries
 * x      interleaved re/im values
 * inc_x  stride between entries, counted in complex elements
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT minf = 0.0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return (minf);

    if (inc_x == 1) {

        /* The vector kernel handles the leading multiple of 32 entries. */
        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            minf = camin_kernel_32(n1, x);
            i = n1;
        } else {
            minf = CABS1(x,0);
            i++;
        }

        /* Scalar tail: compare the full |re| + |im| of each entry, using
           the same measure as the kernel and the strided path. */
        while (i < n) {
            if (CABS1(x,i*2) < minf) {
                minf = CABS1(x,i*2);
            }
            i++;
        }
        return (minf);

    } else {

        inc_x2 = 2 * inc_x;   /* stride in FLOATs */
        minf = CABS1(x,0);
        i += inc_x2;
        j++;

        /* Entry 0 is already consumed; unroll by four while four more fit. */
        BLASLONG n1 = (n - 1) & -4;
        while (j < n1) {

            if (CABS1(x,i) < minf) {
                minf = CABS1(x,i);
            }
            if (CABS1(x,i+inc_x2) < minf) {
                minf = CABS1(x,i+inc_x2);
            }
            if (CABS1(x,i+inc_x2*2) < minf) {
                minf = CABS1(x,i+inc_x2*2);
            }
            if (CABS1(x,i+inc_x2*3) < minf) {
                minf = CABS1(x,i+inc_x2*3);
            }

            i += inc_x2 * 4;
            j += 4;
        }

        while (j < n) {
            if (CABS1(x,i) < minf) {
                minf = CABS1(x,i);
            }
            i += inc_x2;
            j++;
        }
        return (minf);
    }
}
|
||||
|
|
@ -0,0 +1,167 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * Vector kernel: sum of the absolute values of the first 2*n FLOATs of x,
 * i.e. the sum of |re| + |im| over n unit-stride complex entries.
 *
 * The loop runs n >> 5 times and each pass consumes 256 bytes, so the
 * caller must pass a positive multiple of 32.  Four accumulators (v0..v3)
 * are kept in parallel and combined after the loop.
 */
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT total;

    __asm__ (
        /* Clear the four partial-sum vectors. */
        "vzero  %%v0                  \n\t"
        "vzero  %%v1                  \n\t"
        "vzero  %%v2                  \n\t"
        "vzero  %%v3                  \n\t"
        "srlg   %%r0,%1,5             \n\t"   /* r0 = number of passes */
        "xgr    %%r1,%%r1             \n\t"   /* r1 = running byte offset */
        "0:                           \n\t"
        "pfd    1, 1024(%%r1,%2)      \n\t"

        /* First 128 bytes of the pass. */
        "vl     %%v16, 0(%%r1,%2)     \n\t"
        "vl     %%v17, 16(%%r1,%2)    \n\t"
        "vl     %%v18, 32(%%r1,%2)    \n\t"
        "vl     %%v19, 48(%%r1,%2)    \n\t"
        "vl     %%v20, 64(%%r1,%2)    \n\t"
        "vl     %%v21, 80(%%r1,%2)    \n\t"
        "vl     %%v22, 96(%%r1,%2)    \n\t"
        "vl     %%v23, 112(%%r1,%2)   \n\t"

        "vflpsb %%v16, %%v16          \n\t"
        "vflpsb %%v17, %%v17          \n\t"
        "vflpsb %%v18, %%v18          \n\t"
        "vflpsb %%v19, %%v19          \n\t"
        "vflpsb %%v20, %%v20          \n\t"
        "vflpsb %%v21, %%v21          \n\t"
        "vflpsb %%v22, %%v22          \n\t"
        "vflpsb %%v23, %%v23          \n\t"

        "vfasb  %%v0,%%v0,%%v16       \n\t"
        "vfasb  %%v1,%%v1,%%v17       \n\t"
        "vfasb  %%v2,%%v2,%%v18       \n\t"
        "vfasb  %%v3,%%v3,%%v19       \n\t"
        "vfasb  %%v0,%%v0,%%v20       \n\t"
        "vfasb  %%v1,%%v1,%%v21       \n\t"
        "vfasb  %%v2,%%v2,%%v22       \n\t"
        "vfasb  %%v3,%%v3,%%v23       \n\t"

        /* Second 128 bytes of the pass. */
        "vl     %%v16, 128(%%r1,%2)   \n\t"
        "vl     %%v17, 144(%%r1,%2)   \n\t"
        "vl     %%v18, 160(%%r1,%2)   \n\t"
        "vl     %%v19, 176(%%r1,%2)   \n\t"
        "vl     %%v20, 192(%%r1,%2)   \n\t"
        "vl     %%v21, 208(%%r1,%2)   \n\t"
        "vl     %%v22, 224(%%r1,%2)   \n\t"
        "vl     %%v23, 240(%%r1,%2)   \n\t"

        "vflpsb %%v16, %%v16          \n\t"
        "vflpsb %%v17, %%v17          \n\t"
        "vflpsb %%v18, %%v18          \n\t"
        "vflpsb %%v19, %%v19          \n\t"
        "vflpsb %%v20, %%v20          \n\t"
        "vflpsb %%v21, %%v21          \n\t"
        "vflpsb %%v22, %%v22          \n\t"
        "vflpsb %%v23, %%v23          \n\t"

        "vfasb  %%v0,%%v0,%%v16       \n\t"
        "vfasb  %%v1,%%v1,%%v17       \n\t"
        "vfasb  %%v2,%%v2,%%v18       \n\t"
        "vfasb  %%v3,%%v3,%%v19       \n\t"
        "vfasb  %%v0,%%v0,%%v20       \n\t"
        "vfasb  %%v1,%%v1,%%v21       \n\t"
        "vfasb  %%v2,%%v2,%%v22       \n\t"
        "vfasb  %%v3,%%v3,%%v23       \n\t"

        "agfi   %%r1,256              \n\t"
        "brctg  %%r0,0b               \n\t"

        /* Combine the accumulators, then fold the four lanes into f0. */
        "vfasb  %%v0,%%v0,%%v1        \n\t"
        "vfasb  %%v0,%%v0,%%v2        \n\t"
        "vfasb  %%v0,%%v0,%%v3        \n\t"
        "veslg  %%v1,%%v0,32          \n\t"
        "vfasb  %%v0,%%v0,%%v1        \n\t"
        "vrepf  %%v1,%%v0,2           \n\t"
        "aebr   %%f0,%%f1             \n\t"
        "ler    %0,%%f0                   "
        : "=f"(total)
        : "r"(n), "ZR"((const FLOAT (*)[n * 2])x)
        : "memory", "cc", "r0", "r1",
          "v0", "v1", "v2", "v3",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
    );

    return total;
}
|
||||
|
||||
/*
 * Returns the sum of |re| + |im| over n complex entries of x.
 *
 * n      number of complex entries
 * x      interleaved re/im values
 * inc_x  stride between entries, counted in complex elements
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT sumf = 0.0;
    BLASLONG done = 0;    /* complex entries already accumulated */
    BLASLONG pos = 0;     /* FLOAT index of the next entry's real part */
    BLASLONG step;        /* FLOATs between consecutive entries */

    if (n <= 0 || inc_x <= 0) return(sumf);

    if (inc_x != 1) {
        step = 2 * inc_x;
    } else {
        /* Unit stride: the vector kernel takes the leading multiple of 32. */
        BLASLONG bulk = n & -32;

        if (bulk > 0) {
            sumf = casum_kernel_32(bulk, x);
            done = bulk;
            pos = 2 * bulk;
        }
        step = 2;
    }

    /* Scalar loop for whatever the kernel did not cover. */
    for (; done < n; done++, pos += step) {
        sumf += ABS(x[pos]) + ABS(x[pos + 1]);
    }

    return(sumf);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector kernel for the unit-stride complex update
 *     y := y + alpha * x          (CONJ not defined)
 *     y := y + alpha * conj(x)    (CONJ defined)
 * on interleaved (real, imaginary) data.
 *
 * n      element count. The loop counter is n >> 4 and every pass consumes
 *        128 bytes of x and of y, i.e. 16 elements of two 4-byte values.
 *        The caller only invokes this with a non-zero multiple of 16
 *        (brctg decrements before testing, so a zero count would wrap).
 * x      source vector (only loaded from).
 * y      destination vector (loaded, updated and stored back).
 * alpha  two values; the caller stores the real part in alpha[0] and the
 *        imaginary part in alpha[1] (read here at byte offsets 0 and 4).
 *
 * Clobbers r0 (loop counter), r1 (byte offset), v0, v1 and v16-v31.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    __asm__ volatile(
#if !defined(CONJ)
        /* v0 = { ar,  ar,  ar,  ar }, v1 = { -ai, ai, -ai, ai } */
        "vlrepf %%v0,0(%3) \n\t"
        "vlef %%v1,4(%3),0 \n\t"
        "vlef %%v1,4(%3),2 \n\t"
        "vflcsb %%v1,%%v1 \n\t"
        "vlef %%v1,4(%3),1 \n\t"
        "vlef %%v1,4(%3),3 \n\t"
#else
        /* v0 = { ar, -ar, ar, -ar }, v1 = { ai, ai, ai, ai } */
        "vlef %%v0,0(%3),1 \n\t"
        "vlef %%v0,0(%3),3 \n\t"
        "vflcsb %%v0,%%v0 \n\t"
        "vlef %%v0,0(%3),0 \n\t"
        "vlef %%v0,0(%3),2 \n\t"
        "vlrepf %%v1,4(%3) \n\t"
#endif
        /* r0 = number of 16-element passes, r1 = running byte offset */
        "srlg %%r0,%0,4 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* prefetch hints 1024 bytes ahead: x for reading, y for writing */
        "pfd 1, 1024(%%r1,%1) \n\t"
        "pfd 2, 1024(%%r1,%2) \n\t"

        /* first 64 bytes: v16-v19 = x, v20-v23 = y */
        "vl %%v16,0(%%r1,%1) \n\t"
        "vl %%v17,16(%%r1,%1) \n\t"
        "vl %%v18,32(%%r1,%1) \n\t"
        "vl %%v19,48(%%r1,%1) \n\t"
        "vl %%v20,0(%%r1,%2) \n\t"
        "vl %%v21,16(%%r1,%2) \n\t"
        "vl %%v22,32(%%r1,%2) \n\t"
        "vl %%v23,48(%%r1,%2) \n\t"
        /* rotate each 64-bit pair by 32 bits: swaps real and imaginary */
        "verllg %%v24,%%v16,32 \n\t"
        "verllg %%v25,%%v17,32 \n\t"
        "verllg %%v26,%%v18,32 \n\t"
        "verllg %%v27,%%v19,32 \n\t"

        /* v28-v31 = x * v0 + y */
        "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
        "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
        "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
        "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"

        /* v28-v31 += swapped(x) * v1 */
        "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
        "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
        "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
        "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"

        "vst %%v28,0(%%r1,%2) \n\t"
        "vst %%v29,16(%%r1,%2) \n\t"
        "vst %%v30,32(%%r1,%2) \n\t"
        "vst %%v31,48(%%r1,%2) \n\t"

        /* second 64 bytes: same sequence at offsets 64..112 */
        "vl %%v16,64(%%r1,%1) \n\t"
        "vl %%v17,80(%%r1,%1) \n\t"
        "vl %%v18,96(%%r1,%1) \n\t"
        "vl %%v19,112(%%r1,%1) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "verllg %%v24,%%v16,32 \n\t"
        "verllg %%v25,%%v17,32 \n\t"
        "verllg %%v26,%%v18,32 \n\t"
        "verllg %%v27,%%v19,32 \n\t"

        "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
        "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
        "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
        "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"

        "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
        "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
        "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
        "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"

        "vst %%v28,64(%%r1,%2) \n\t"
        "vst %%v29,80(%%r1,%2) \n\t"
        "vst %%v30,96(%%r1,%2) \n\t"
        "vst %%v31,112(%%r1,%2) \n\t"

        /* advance 128 bytes and loop while the pass counter is non-zero */
        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha)
        :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Complex AXPY driver: y := y + (da_r + i*da_i) * x, or with conj(x) when
 * CONJ is defined.  x and y hold interleaved (real, imaginary) values;
 * inc_x / inc_y are strides in complex elements.
 *
 * With both strides equal to 1 the largest multiple-of-16 prefix is handed
 * to caxpy_kernel_16 and the remainder is finished in scalar code; any other
 * stride combination is handled entirely by the scalar loop.
 *
 * The dummy parameters are not used.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    BLASLONG k;
    BLASLONG px, py;

    if (n <= 0)
        return (0);

    if ((inc_x == 1) && (inc_y == 1)) {
        /* number of elements the vector kernel can take (multiple of 16) */
        const BLASLONG vec_n = n & -16;

        if (vec_n) {
            FLOAT da[2];

            da[0] = da_r;
            da[1] = da_i;
            caxpy_kernel_16(vec_n, x, y, da);
        }

        /* scalar tail: elements vec_n .. n-1, two values per element */
        for (k = vec_n, px = 2 * vec_n; k < n; k++, px += 2) {
#if !defined(CONJ)
            y[px] += (da_r * x[px] - da_i * x[px + 1]);
            y[px + 1] += (da_r * x[px + 1] + da_i * x[px]);
#else
            y[px] += (da_r * x[px] + da_i * x[px + 1]);
            y[px + 1] -= (da_r * x[px + 1] - da_i * x[px]);
#endif
        }
        return (0);
    }

    /* general strides: convert element strides into value strides */
    inc_x *= 2;
    inc_y *= 2;

    for (k = 0, px = 0, py = 0; k < n; k++, px += inc_x, py += inc_y) {
#if !defined(CONJ)
        y[py] += (da_r * x[px] - da_i * x[px + 1]);
        y[py + 1] += (da_r * x[px + 1] + da_i * x[px]);
#else
        y[py] += (da_r * x[px] + da_i * x[px + 1]);
        y[py + 1] -= (da_r * x[px + 1] - da_i * x[px]);
#endif
    }
    return (0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,99 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Block copy kernel for unit-stride complex vectors: copies from x to y in
 * chunks of 256 bytes using mvc.
 *
 * n  element count; the loop runs n >> 5 times, so each pass covers 32
 *    elements (256 bytes).  The caller only invokes this with a positive
 *    multiple of 32 (brctg decrements before testing, so a zero count
 *    would wrap).
 * x  source address (copied into r1 and advanced there).
 * y  destination address (copied into r2 and advanced there).
 *
 * Clobbers r0 (loop counter), r1 and r2.
 */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        "lgr %%r1,%1 \n\t"
        "lgr %%r2,%2 \n\t"
        "srlg %%r0,%0,5 \n\t"
        "0: \n\t"
        /* prefetch hints 1024 bytes ahead: source for reading, target for writing */
        "pfd 1, 1024(%%r1) \n\t"
        "pfd 2, 1024(%%r2) \n\t"
        /* move 256 bytes from (r1) to (r2) */
        "mvc 0(256,%%r2),0(%%r1) \n\t"
        "agfi %%r1,256 \n\t"
        "agfi %%r2,256 \n\t"
        "brctg %%r0,0b "
        :
        :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
        :"memory","cc","r0","r1","r2"
    );
}
|
||||
|
||||
/*
 * Complex COPY driver: y := x for n complex elements stored as interleaved
 * (real, imaginary) values.  inc_x / inc_y are strides in complex elements.
 *
 * With both strides equal to 1 the largest multiple-of-32 prefix is copied
 * by ccopy_kernel_32 and the remaining elements are copied one by one; any
 * other stride combination uses the scalar loop only.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;

    if (n <= 0)
        return (0);

    if ((inc_x == 1) && (inc_y == 1)) {
        /* number of elements the block kernel can take (multiple of 32) */
        BLASLONG n1 = n & -32;

        if (n1 > 0) {
            ccopy_kernel_32(n1, x, y);
            i = n1;
            ix = n1 * 2;
            iy = n1 * 2;
        }

        /* scalar tail */
        while (i < n) {
            /* the source is indexed with ix for both parts (the real part
               previously used iy, which only worked because ix == iy here) */
            y[iy] = x[ix];
            y[iy + 1] = x[ix + 1];
            ix += 2;
            iy += 2;
            i++;
        }
    } else {
        /* strides expressed in FLOAT values rather than complex elements */
        BLASLONG inc_x2 = 2 * inc_x;
        BLASLONG inc_y2 = 2 * inc_y;

        while (i < n) {
            y[iy] = x[ix];
            y[iy + 1] = x[ix + 1];
            ix += inc_x2;
            iy += inc_y2;
            i++;
        }
    }

    return (0);
}
|
||||
|
|
@ -0,0 +1,182 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector kernel that accumulates the four partial products needed for a
 * complex dot product over unit-stride interleaved (real, imaginary) data.
 *
 * n  element count; the loop runs n >> 4 times and consumes 128 bytes of x
 *    and of y per pass (16 elements).  The caller only invokes this with a
 *    non-zero multiple of 16 (brctg decrements before testing, so a zero
 *    count would wrap).
 * x  first vector (only loaded from).
 * y  second vector (only loaded from).
 * d  receives four sums, overwritten (not accumulated):
 *      d[0] = sum of x.re * y.re
 *      d[1] = sum of x.im * y.im
 *      d[2] = sum of x.re * y.im
 *      d[3] = sum of x.im * y.re
 *
 * Clobbers r0 (loop counter), r1 (byte offset), v0-v3 and v16-v31.
 */
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
    __asm__ volatile(
        /* v24/v26/v28/v30 accumulate x*y, v25/v27/v29/v31 accumulate swapped(x)*y */
        "vzero %%v24 \n\t"
        "vzero %%v25 \n\t"
        "vzero %%v26 \n\t"
        "vzero %%v27 \n\t"
        "vzero %%v28 \n\t"
        "vzero %%v29 \n\t"
        "vzero %%v30 \n\t"
        "vzero %%v31 \n\t"
        /* r0 = number of 16-element passes, r1 = running byte offset */
        "srlg %%r0,%0,4 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* prefetch hints 1024 bytes ahead for both inputs */
        "pfd 1, 1024(%%r1,%1) \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* first 64 bytes: v16-v19 = x, v0-v3 = y */
        "vl %%v16, 0(%%r1,%1) \n\t"
        "vl %%v17, 16(%%r1,%1) \n\t"
        "vl %%v18, 32(%%r1,%1) \n\t"
        "vl %%v19, 48(%%r1,%1) \n\t"
        "vl %%v0, 0(%%r1,%2) \n\t"
        "vl %%v1, 16(%%r1,%2) \n\t"
        "vl %%v2, 32(%%r1,%2) \n\t"
        "vl %%v3, 48(%%r1,%2) \n\t"
        /* rotate each 64-bit pair by 32 bits: swaps real and imaginary of x */
        "verllg %%v20,%%v16,32 \n\t"
        "verllg %%v21,%%v17,32 \n\t"
        "verllg %%v22,%%v18,32 \n\t"
        "verllg %%v23,%%v19,32 \n\t"

        "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
        "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
        "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
        "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
        "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
        "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
        "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
        "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"

        /* second 64 bytes: same sequence at offsets 64..112 */
        "vl %%v16, 64(%%r1,%1) \n\t"
        "vl %%v17, 80(%%r1,%1) \n\t"
        "vl %%v18, 96(%%r1,%1) \n\t"
        "vl %%v19, 112(%%r1,%1) \n\t"
        "vl %%v0, 64(%%r1,%2) \n\t"
        "vl %%v1, 80(%%r1,%2) \n\t"
        "vl %%v2, 96(%%r1,%2) \n\t"
        "vl %%v3, 112(%%r1,%2) \n\t"
        "verllg %%v20,%%v16,32 \n\t"
        "verllg %%v21,%%v17,32 \n\t"
        "verllg %%v22,%%v18,32 \n\t"
        "verllg %%v23,%%v19,32 \n\t"

        "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
        "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
        "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
        "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
        "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
        "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
        "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
        "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"
        /* fold the x*y accumulators into v24, then add its upper and lower halves */
        "vfasb %%v24,%%v24,%%v26 \n\t"
        "vfasb %%v24,%%v24,%%v28 \n\t"
        "vfasb %%v24,%%v24,%%v30 \n\t"
        "vrepg %%v26,%%v24,1 \n\t"
        "vfasb %%v24,%%v24,%%v26 \n\t"
        /* same reduction for the swapped(x)*y accumulators into v25 */
        "vfasb %%v25,%%v25,%%v27 \n\t"
        "vfasb %%v25,%%v25,%%v29 \n\t"
        "vfasb %%v25,%%v25,%%v31 \n\t"
        "vrepg %%v27,%%v25,1 \n\t"
        "vfasb %%v25,%%v25,%%v27 \n\t"
        /* d[0], d[1] from v24 lanes 0, 1; d[2], d[3] from v25 lanes 1, 0 */
        "vstef %%v24,0(%3),0 \n\t"
        "vstef %%v24,4(%3),1 \n\t"
        "vstef %%v25,8(%3),1 \n\t"
        "vstef %%v25,12(%3),0 "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Complex dot product driver for interleaved (real, imaginary) vectors.
 * inc_x / inc_y are strides in complex elements.
 *
 * Four partial sums are accumulated in dot[]:
 *     dot[0] = sum x.re*y.re      dot[1] = sum x.im*y.im
 *     dot[2] = sum x.re*y.im      dot[3] = sum x.im*y.re
 * and combined at the end according to CONJ.  With unit strides the largest
 * multiple-of-16 prefix is handled by cdot_kernel_16 (which overwrites
 * dot[]) and the tail is added in scalar code.
 *
 * Returns the result as OPENBLAS_COMPLEX_FLOAT; zero when n <= 0.
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    OPENBLAS_COMPLEX_FLOAT result;
    FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
    BLASLONG k;
    BLASLONG px, py;

    if (n <= 0) {
        CREAL(result) = 0.0;
        CIMAG(result) = 0.0;
        return (result);
    }

    if ((inc_x != 1) || (inc_y != 1)) {
        /* general strides: step in FLOAT values, two per element */
        inc_x <<= 1;
        inc_y <<= 1;

        for (k = 0, px = 0, py = 0; k < n; k++, px += inc_x, py += inc_y) {
            dot[0] += x[px] * y[py];
            dot[1] += x[px + 1] * y[py + 1];
            dot[2] += x[px] * y[py + 1];
            dot[3] += x[px + 1] * y[py];
        }
    } else {
        /* number of elements the vector kernel can take (multiple of 16) */
        const BLASLONG vec_n = n & -16;

        if (vec_n)
            cdot_kernel_16(vec_n, x, y, dot);

        /* scalar tail: elements vec_n .. n-1 */
        for (k = vec_n, px = vec_n * 2; k < n; k++, px += 2) {
            dot[0] += x[px] * y[px];
            dot[1] += x[px + 1] * y[px + 1];
            dot[2] += x[px] * y[px + 1];
            dot[3] += x[px + 1] * y[px];
        }
    }

#if !defined(CONJ)
    CREAL(result) = dot[0] - dot[1];
    CIMAG(result) = dot[2] + dot[3];
#else
    CREAL(result) = dot[0] + dot[1];
    CIMAG(result) = dot[2] - dot[3];
#endif

    return (result);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,256 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector kernel applying a plane rotation with real c and s to two
 * unit-stride vectors, in place:
 *     x_new = c * x + s * y
 *     y_new = c * y - s * x
 * The same factors are applied to every stored value, so real and imaginary
 * parts need no special treatment.
 *
 * n  element count; the loop runs n >> 5 times and processes 256 bytes of x
 *    and of y per pass (32 elements) in four 64-byte groups.  The caller
 *    only invokes this with a positive multiple of 32 (brctg decrements
 *    before testing, so a zero count would wrap).
 * x  first vector, updated in place.
 * y  second vector, updated in place.
 * c  pointer to the factor replicated into v0.
 * s  pointer to the factor replicated into v1.
 *
 * The statement has no output operands, so GCC treats it as volatile even
 * without the keyword.  Clobbers r0 (loop counter), r1 (byte offset), v0,
 * v1 and v16-v31.
 */
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
    __asm__ (
        /* v0 = c in every lane, v1 = s in every lane */
        "vlrepf %%v0,%3 \n\t"
        "vlrepf %%v1,%4 \n\t"
        "srlg %%r0,%0,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* both vectors are written, so both get store-intent prefetch hints */
        "pfd 2, 1024(%%r1,%1) \n\t"
        "pfd 2, 1024(%%r1,%2) \n\t"
        /* group 1 (bytes 0..63): v24-v27 = x, v16-v19 = y */
        "vl %%v24, 0(%%r1,%1) \n\t"
        "vl %%v25, 16(%%r1,%1) \n\t"
        "vl %%v26, 32(%%r1,%1) \n\t"
        "vl %%v27, 48(%%r1,%1) \n\t"
        "vl %%v16, 0(%%r1,%2) \n\t"
        "vl %%v17, 16(%%r1,%2) \n\t"
        "vl %%v18, 32(%%r1,%2) \n\t"
        "vl %%v19, 48(%%r1,%2) \n\t"

        /* first parts: v28-v31 = x*c, v20-v23 = x*s */
        "vfmsb %%v28,%%v24,%%v0 \n\t"
        "vfmsb %%v29,%%v25,%%v0 \n\t"
        "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v30,%%v26,%%v0 \n\t"
        "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v31,%%v27,%%v0 \n\t"
        "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
        /* second parts: v28-v31 = y*s + x*c, v20-v23 = y*c - x*s */
        "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
        "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
        "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
        "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
        "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
        "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
        "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
        "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

        "vst %%v28, 0(%%r1,%1) \n\t"
        "vst %%v29, 16(%%r1,%1) \n\t"
        "vst %%v30, 32(%%r1,%1) \n\t"
        "vst %%v31, 48(%%r1,%1) \n\t"
        "vst %%v20, 0(%%r1,%2) \n\t"
        "vst %%v21, 16(%%r1,%2) \n\t"
        "vst %%v22, 32(%%r1,%2) \n\t"
        "vst %%v23, 48(%%r1,%2) \n\t"

        /* group 2 (bytes 64..127): same sequence */
        "vl %%v24, 64(%%r1,%1) \n\t"
        "vl %%v25, 80(%%r1,%1) \n\t"
        "vl %%v26, 96(%%r1,%1) \n\t"
        "vl %%v27, 112(%%r1,%1) \n\t"
        "vl %%v16, 64(%%r1,%2) \n\t"
        "vl %%v17, 80(%%r1,%2) \n\t"
        "vl %%v18, 96(%%r1,%2) \n\t"
        "vl %%v19, 112(%%r1,%2) \n\t"

        "vfmsb %%v28,%%v24,%%v0 \n\t"
        "vfmsb %%v29,%%v25,%%v0 \n\t"
        "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v30,%%v26,%%v0 \n\t"
        "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v31,%%v27,%%v0 \n\t"
        "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
        /* second parts */
        "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
        "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
        "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
        "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
        "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
        "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
        "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
        "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

        "vst %%v28, 64(%%r1,%1) \n\t"
        "vst %%v29, 80(%%r1,%1) \n\t"
        "vst %%v30, 96(%%r1,%1) \n\t"
        "vst %%v31, 112(%%r1,%1) \n\t"
        "vst %%v20, 64(%%r1,%2) \n\t"
        "vst %%v21, 80(%%r1,%2) \n\t"
        "vst %%v22, 96(%%r1,%2) \n\t"
        "vst %%v23, 112(%%r1,%2) \n\t"

        /* group 3 (bytes 128..191): same sequence */
        "vl %%v24, 128(%%r1,%1) \n\t"
        "vl %%v25, 144(%%r1,%1) \n\t"
        "vl %%v26, 160(%%r1,%1) \n\t"
        "vl %%v27, 176(%%r1,%1) \n\t"
        "vl %%v16, 128(%%r1,%2) \n\t"
        "vl %%v17, 144(%%r1,%2) \n\t"
        "vl %%v18, 160(%%r1,%2) \n\t"
        "vl %%v19, 176(%%r1,%2) \n\t"

        "vfmsb %%v28,%%v24,%%v0 \n\t"
        "vfmsb %%v29,%%v25,%%v0 \n\t"
        "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v30,%%v26,%%v0 \n\t"
        "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v31,%%v27,%%v0 \n\t"
        "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
        /* second parts */
        "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
        "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
        "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
        "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
        "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
        "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
        "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
        "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

        "vst %%v28, 128(%%r1,%1) \n\t"
        "vst %%v29, 144(%%r1,%1) \n\t"
        "vst %%v30, 160(%%r1,%1) \n\t"
        "vst %%v31, 176(%%r1,%1) \n\t"
        "vst %%v20, 128(%%r1,%2) \n\t"
        "vst %%v21, 144(%%r1,%2) \n\t"
        "vst %%v22, 160(%%r1,%2) \n\t"
        "vst %%v23, 176(%%r1,%2) \n\t"

        /* group 4 (bytes 192..255): same sequence */
        "vl %%v24, 192(%%r1,%1) \n\t"
        "vl %%v25, 208(%%r1,%1) \n\t"
        "vl %%v26, 224(%%r1,%1) \n\t"
        "vl %%v27, 240(%%r1,%1) \n\t"
        "vl %%v16, 192(%%r1,%2) \n\t"
        "vl %%v17, 208(%%r1,%2) \n\t"
        "vl %%v18, 224(%%r1,%2) \n\t"
        "vl %%v19, 240(%%r1,%2) \n\t"

        "vfmsb %%v28,%%v24,%%v0 \n\t"
        "vfmsb %%v29,%%v25,%%v0 \n\t"
        "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v30,%%v26,%%v0 \n\t"
        "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
        "vfmsb %%v31,%%v27,%%v0 \n\t"
        "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
        /* second parts */
        "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
        "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
        "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
        "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
        "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
        "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
        "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
        "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

        "vst %%v28, 192(%%r1,%1) \n\t"
        "vst %%v29, 208(%%r1,%1) \n\t"
        "vst %%v30, 224(%%r1,%1) \n\t"
        "vst %%v31, 240(%%r1,%1) \n\t"
        "vst %%v20, 192(%%r1,%2) \n\t"
        "vst %%v21, 208(%%r1,%2) \n\t"
        "vst %%v22, 224(%%r1,%2) \n\t"
        "vst %%v23, 240(%%r1,%2) \n\t"

        /* advance 256 bytes and loop while the pass counter is non-zero */
        "agfi %%r1,256 \n\t"
        "brctg %%r0,0b "
        :
        :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
        :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Plane rotation driver for complex vectors with real c and s:
 *     x_new = c * x + s * y
 *     y_new = c * y - s * x
 * applied to the real and imaginary value of every element, in place.
 * inc_x / inc_y are strides in complex elements.
 *
 * With both strides equal to 1 the largest multiple-of-32 prefix is handled
 * by crot_kernel_32 and the tail in scalar code; any other stride
 * combination uses the scalar loop only.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k;
    BLASLONG px, py;
    FLOAT rot_re, rot_im;   /* rotated x, held until y has been updated */

    if (n <= 0)
        return (0);

    if ((inc_x != 1) || (inc_y != 1)) {
        /* strides expressed in FLOAT values rather than complex elements */
        const BLASLONG step_x = 2 * inc_x;
        const BLASLONG step_y = 2 * inc_y;

        for (k = 0, px = 0, py = 0; k < n; k++, px += step_x, py += step_y) {
            rot_re = c*x[px] + s*y[py];
            rot_im = c*x[px+1] + s*y[py+1];
            y[py] = c*y[py] - s*x[px];
            y[py+1] = c*y[py+1] - s*x[px+1];
            x[px] = rot_re;
            x[px+1] = rot_im;
        }
    } else {
        /* number of elements the vector kernel can take (multiple of 32) */
        const BLASLONG vec_n = n & -32;

        if (vec_n > 0) {
            /* the kernel reads c and s from memory, so pass addressable copies */
            FLOAT cosa = c;
            FLOAT sina = s;

            crot_kernel_32(vec_n, x, y, &cosa, &sina);
        }

        /* scalar tail: elements vec_n .. n-1 */
        for (k = vec_n, px = 2 * vec_n; k < n; k++, px += 2) {
            rot_re = c*x[px] + s*y[px];
            rot_im = c*x[px+1] + s*y[px+1];
            y[px] = c*y[px] - s*x[px];
            y[px+1] = c*y[px+1] - s*x[px+1];
            x[px] = rot_re;
            x[px+1] = rot_im;
        }
    }

    return (0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,456 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013 - 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector kernel for the in-place complex scaling x := alpha * x on
 * unit-stride interleaved (real, imaginary) data, for the general case
 * where both values of alpha are used.
 *
 * n      element count; the loop runs n >> 4 times and processes 128 bytes
 *        per pass (16 elements).  brctg decrements before testing, so the
 *        count must be a non-zero multiple of 16.
 * alpha  two values read at byte offsets 0 and 4.
 *        NOTE(review): taken to be (real, imaginary) as in the strided
 *        helper of this file - confirm against the caller.
 * x      vector, updated in place.
 *
 * Clobbers r0 (loop counter), r1 (byte offset), v0, v1 and v16-v31.
 */
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
    __asm__ volatile(
        /* v0 = { a0, a0, a0, a0 }, v1 = { -a1, a1, -a1, a1 } */
        "vlrepf %%v0,0(%1) \n\t"
        "vlef %%v1,4(%1),0 \n\t"
        "vlef %%v1,4(%1),2 \n\t"
        "vflcsb %%v1,%%v1 \n\t"
        "vlef %%v1,4(%1),1 \n\t"
        "vlef %%v1,4(%1),3 \n\t"
        "srlg %%r0,%0,4 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 2, 1024(%%r1,%2) \n\t"

        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        /* v24-v31 = x with the two values of every element swapped */
        "verllg %%v24,%%v16,32 \n\t"
        "verllg %%v25,%%v17,32 \n\t"
        "verllg %%v26,%%v18,32 \n\t"
        "verllg %%v27,%%v19,32 \n\t"
        "verllg %%v28,%%v20,32 \n\t"
        "verllg %%v29,%%v21,32 \n\t"
        "verllg %%v30,%%v22,32 \n\t"
        "verllg %%v31,%%v23,32 \n\t"

        /* v16-v23 = x * v0 */
        "vfmsb %%v16,%%v16,%%v0 \n\t"
        "vfmsb %%v17,%%v17,%%v0 \n\t"
        "vfmsb %%v18,%%v18,%%v0 \n\t"
        "vfmsb %%v19,%%v19,%%v0 \n\t"
        "vfmsb %%v20,%%v20,%%v0 \n\t"
        "vfmsb %%v21,%%v21,%%v0 \n\t"
        "vfmsb %%v22,%%v22,%%v0 \n\t"
        "vfmsb %%v23,%%v23,%%v0 \n\t"
        /* v16-v23 += swapped(x) * v1 */
        "vfmasb %%v16,%%v24,%%v1,%%v16 \n\t"
        "vfmasb %%v17,%%v25,%%v1,%%v17 \n\t"
        "vfmasb %%v18,%%v26,%%v1,%%v18 \n\t"
        "vfmasb %%v19,%%v27,%%v1,%%v19 \n\t"
        "vfmasb %%v20,%%v28,%%v1,%%v20 \n\t"
        "vfmasb %%v21,%%v29,%%v1,%%v21 \n\t"
        "vfmasb %%v22,%%v30,%%v1,%%v22 \n\t"
        "vfmasb %%v23,%%v31,%%v1,%%v23 \n\t"

        "vst %%v16,0(%%r1,%2) \n\t"
        "vst %%v17,16(%%r1,%2) \n\t"
        "vst %%v18,32(%%r1,%2) \n\t"
        "vst %%v19,48(%%r1,%2) \n\t"
        "vst %%v20,64(%%r1,%2) \n\t"
        "vst %%v21,80(%%r1,%2) \n\t"
        "vst %%v22,96(%%r1,%2) \n\t"
        "vst %%v23,112(%%r1,%2) \n\t"

        /* advance 128 bytes and loop while the pass counter is non-zero */
        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b "
        :
        :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Variant of the unit-stride complex scaling kernel that reads only the
 * second value of alpha (byte offset 4); alpha[0] is never loaded.  Each
 * element (p, q) of x is replaced by (-a1 * q, a1 * p).
 * NOTE(review): per the name this is meant for alpha with a zero real
 * part - confirm against the caller.
 *
 * n      element count; the loop runs n >> 4 times and processes 128 bytes
 *        per pass (16 elements).  brctg decrements before testing, so the
 *        count must be a non-zero multiple of 16.
 * alpha  two values; only the one at byte offset 4 is used.
 * x      vector, updated in place.
 *
 * Clobbers r0 (loop counter), r1 (byte offset), v0 and v16-v23.
 */
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
    __asm__ volatile(
        /* v0 = { -a1, a1, -a1, a1 } */
        "vlef %%v0,4(%1),0 \n\t"
        "vlef %%v0,4(%1),2 \n\t"
        "vflcsb %%v0,%%v0 \n\t"
        "vlef %%v0,4(%1),1 \n\t"
        "vlef %%v0,4(%1),3 \n\t"
        "srlg %%r0,%0,4 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 2, 1024(%%r1,%2) \n\t"

        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        /* swap the two values of every element in place */
        "verllg %%v16,%%v16,32 \n\t"
        "verllg %%v17,%%v17,32 \n\t"
        "verllg %%v18,%%v18,32 \n\t"
        "verllg %%v19,%%v19,32 \n\t"
        "verllg %%v20,%%v20,32 \n\t"
        "verllg %%v21,%%v21,32 \n\t"
        "verllg %%v22,%%v22,32 \n\t"
        "verllg %%v23,%%v23,32 \n\t"

        /* multiply the swapped values by v0 */
        "vfmsb %%v16,%%v16,%%v0 \n\t"
        "vfmsb %%v17,%%v17,%%v0 \n\t"
        "vfmsb %%v18,%%v18,%%v0 \n\t"
        "vfmsb %%v19,%%v19,%%v0 \n\t"
        "vfmsb %%v20,%%v20,%%v0 \n\t"
        "vfmsb %%v21,%%v21,%%v0 \n\t"
        "vfmsb %%v22,%%v22,%%v0 \n\t"
        "vfmsb %%v23,%%v23,%%v0 \n\t"

        "vst %%v16,0(%%r1,%2) \n\t"
        "vst %%v17,16(%%r1,%2) \n\t"
        "vst %%v18,32(%%r1,%2) \n\t"
        "vst %%v19,48(%%r1,%2) \n\t"
        "vst %%v20,64(%%r1,%2) \n\t"
        "vst %%v21,80(%%r1,%2) \n\t"
        "vst %%v22,96(%%r1,%2) \n\t"
        "vst %%v23,112(%%r1,%2) \n\t"

        /* advance 128 bytes and loop while the pass counter is non-zero */
        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b "
        :
        :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
    );
}
|
||||
|
||||
/*
 * Variant of the unit-stride complex scaling kernel that reads only the
 * first value of alpha (byte offset 0) and multiplies every stored value of
 * x by it; the value at offset 4 is never loaded.
 * NOTE(review): per the name this is meant for alpha with a zero imaginary
 * part - confirm against the caller.
 *
 * n      element count; the loop runs n >> 4 times and processes 128 bytes
 *        per pass (16 elements).  brctg decrements before testing, so the
 *        count must be a non-zero multiple of 16.
 * alpha  two values; only the one at byte offset 0 is used.
 * x      vector, updated in place.
 *
 * Clobbers r0 (loop counter), r1 (byte offset), v0 and v16-v23.
 */
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
    __asm__ volatile(
        /* v0 = a0 replicated into every lane */
        "vlrepf %%v0,0(%1) \n\t"
        "srlg %%r0,%0,4 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 2, 1024(%%r1,%2) \n\t"

        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        "vfmsb %%v16,%%v16,%%v0 \n\t"
        "vfmsb %%v17,%%v17,%%v0 \n\t"
        "vfmsb %%v18,%%v18,%%v0 \n\t"
        "vfmsb %%v19,%%v19,%%v0 \n\t"
        "vfmsb %%v20,%%v20,%%v0 \n\t"
        "vfmsb %%v21,%%v21,%%v0 \n\t"
        "vfmsb %%v22,%%v22,%%v0 \n\t"
        "vfmsb %%v23,%%v23,%%v0 \n\t"

        "vst %%v16,0(%%r1,%2) \n\t"
        "vst %%v17,16(%%r1,%2) \n\t"
        "vst %%v18,32(%%r1,%2) \n\t"
        "vst %%v19,48(%%r1,%2) \n\t"
        "vst %%v20,64(%%r1,%2) \n\t"
        "vst %%v21,80(%%r1,%2) \n\t"
        "vst %%v22,96(%%r1,%2) \n\t"
        "vst %%v23,112(%%r1,%2) \n\t"

        /* advance 128 bytes and loop while the pass counter is non-zero */
        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b "
        :
        :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
    );
}
|
||||
|
||||
/*
 * Fills a unit-stride complex vector with zeros using vector stores; x is
 * never read.
 * NOTE(review): per the name this is the scaling kernel for alpha == 0 -
 * confirm against the caller.
 *
 * n  element count; the loop runs n >> 4 times and stores 128 bytes per
 *    pass (16 elements).  brctg decrements before testing, so the count
 *    must be a non-zero multiple of 16.
 * x  vector to clear.
 *
 * Clobbers r0 (loop counter), r1 (byte offset) and v24-v27.
 */
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x)
{
    __asm__ volatile(
        "vzero %%v24 \n\t"
        "vzero %%v25 \n\t"
        "vzero %%v26 \n\t"
        "vzero %%v27 \n\t"
        "srlg %%r0,%0,4 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 2, 1024(%%r1,%1) \n\t"

        /* eight 16-byte zero stores per pass */
        "vst %%v24,0(%%r1,%1) \n\t"
        "vst %%v25,16(%%r1,%1) \n\t"
        "vst %%v26,32(%%r1,%1) \n\t"
        "vst %%v27,48(%%r1,%1) \n\t"
        "vst %%v24,64(%%r1,%1) \n\t"
        "vst %%v25,80(%%r1,%1) \n\t"
        "vst %%v26,96(%%r1,%1) \n\t"
        "vst %%v27,112(%%r1,%1) \n\t"

        /* advance 128 bytes and loop while the pass counter is non-zero */
        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b "
        :
        :"r"(n),"ZR"((FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v24","v25","v26","v27"
    );
}
|
||||
|
||||
/*
 * Scalar helper for strided complex scaling, x := alpha * x, handling four
 * elements per loop pass.
 *
 * n      number of elements to process; the loop advances in steps of four.
 * alpha  alpha[0] = real part, alpha[1] = imaginary part.
 * x      vector, updated in place.
 * inc_x  distance between consecutive elements in FLOAT values (the caller
 *        passes the element stride already doubled).
 *
 * Within a pass all four new real parts are computed first, then the four
 * imaginary parts are stored (still using the old real parts), and only
 * then are the real parts stored.
 */
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
{
    const BLASLONG off2 = 2 * inc_x;
    const BLASLONG off3 = off2 + inc_x;
    const FLOAT da_r = alpha[0];
    const FLOAT da_i = alpha[1];
    BLASLONG done = 0;

    while (done < n) {
        FLOAT *e0 = x;
        FLOAT *e1 = x + inc_x;
        FLOAT *e2 = x + off2;
        FLOAT *e3 = x + off3;

        /* new real parts, kept aside until the imaginary parts are done */
        const FLOAT re0 = da_r * e0[0] - da_i * e0[1];
        const FLOAT re1 = da_r * e1[0] - da_i * e1[1];
        const FLOAT re2 = da_r * e2[0] - da_i * e2[1];
        const FLOAT re3 = da_r * e3[0] - da_i * e3[1];

        e0[1] = da_i * e0[0] + da_r * e0[1];
        e1[1] = da_i * e1[0] + da_r * e1[1];
        e2[1] = da_i * e2[0] + da_r * e2[1];
        e3[1] = da_i * e3[0] + da_r * e3[1];

        e0[0] = re0;
        e1[0] = re1;
        e2[0] = re2;
        e3[0] = re3;

        x += 4 * inc_x;
        done += 4;
    }
}
|
||||
|
||||
/*
 * Scale the complex vector x in place by the scalar (da_r + i*da_i).
 *
 * n            : number of complex entries.
 * da_r, da_i   : real and imaginary part of the scalar.
 * x            : interleaved (real, imag) data.
 * inc_x        : stride between entries, counted in complex entries.
 * dummy0, dummy1, y, inc_y, dummy, dummy2 : not referenced by this routine.
 *
 * Always returns 0.
 *
 * Four cases are distinguished by which parts of the scalar are zero:
 *   both zero  -> x is overwritten with zeros (x is not read),
 *   da_r zero  -> (re, im) becomes (-da_i * im, da_i * re),
 *   da_i zero  -> both parts are multiplied by da_r,
 *   otherwise  -> full complex multiply.
 * For unit stride the leading n & -16 entries go through the vector
 * kernels; for other strides only the general case uses a helper
 * (cscal_kernel_inc_8 on the leading n & -8 entries).
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    BLASLONG pos = 0;   /* index into x, in FLOAT elements */
    BLASLONG cnt = 0;   /* complex entries handled so far */
    BLASLONG head;      /* size of the unrolled / kernel-processed prefix */
    FLOAT keep0;
    FLOAT keep1;
    FLOAT alpha[2] __attribute__ ((aligned(16)));

    /* ---------------- contiguous data ---------------- */
    if (inc_x == 1) {
        head = n & -16;

        if (head > 0) {
            alpha[0] = da_r;
            alpha[1] = da_i;

            if (da_r == 0.0) {
                if (da_i == 0) {
                    cscal_kernel_16_zero(head, x);
                } else {
                    cscal_kernel_16_zero_r(head, alpha, x);
                }
            } else {
                if (da_i == 0) {
                    cscal_kernel_16_zero_i(head, alpha, x);
                } else {
                    cscal_kernel_16(head, alpha, x);
                }
            }

            pos = head << 1;
            cnt = head;
        }

        /* scalar clean-up for the entries the kernel did not cover */
        if (da_r == 0.0) {
            if (da_i == 0.0) {
                for (; cnt < n; cnt++, pos += 2) {
                    x[pos] = 0.0;
                    x[pos + 1] = 0.0;
                }
            } else {
                for (; cnt < n; cnt++, pos += 2) {
                    keep0 = -da_i * x[pos + 1];
                    x[pos + 1] = da_i * x[pos];
                    x[pos] = keep0;
                }
            }
        } else {
            if (da_i == 0.0) {
                for (; cnt < n; cnt++, pos += 2) {
                    keep0 = da_r * x[pos];
                    x[pos + 1] = da_r * x[pos + 1];
                    x[pos] = keep0;
                }
            } else {
                for (; cnt < n; cnt++, pos += 2) {
                    keep0 = da_r * x[pos] - da_i * x[pos + 1];
                    x[pos + 1] = da_r * x[pos + 1] + da_i * x[pos];
                    x[pos] = keep0;
                }
            }
        }

        return (0);
    }

    /* ---------------- strided data ---------------- */

    /* from here on inc_x is measured in FLOAT elements (two per entry) */
    inc_x <<= 1;

    if (da_r == 0.0) {
        head = n & -2;

        if (da_i == 0.0) {
            /* two entries per pass, then at most one left over */
            for (; cnt < head; cnt += 2, pos += 2 * inc_x) {
                x[pos] = 0.0;
                x[pos + 1] = 0.0;
                x[pos + inc_x] = 0.0;
                x[pos + 1 + inc_x] = 0.0;
            }
            for (; cnt < n; cnt++, pos += inc_x) {
                x[pos] = 0.0;
                x[pos + 1] = 0.0;
            }
        } else {
            for (; cnt < head; cnt += 2, pos += 2 * inc_x) {
                keep0 = -da_i * x[pos + 1];
                x[pos + 1] = da_i * x[pos];
                x[pos] = keep0;
                keep1 = -da_i * x[pos + 1 + inc_x];
                x[pos + 1 + inc_x] = da_i * x[pos + inc_x];
                x[pos + inc_x] = keep1;
            }
            for (; cnt < n; cnt++, pos += inc_x) {
                keep0 = -da_i * x[pos + 1];
                x[pos + 1] = da_i * x[pos];
                x[pos] = keep0;
            }
        }
    } else {
        if (da_i == 0.0) {
            head = n & -2;

            for (; cnt < head; cnt += 2, pos += 2 * inc_x) {
                keep0 = da_r * x[pos];
                x[pos + 1] = da_r * x[pos + 1];
                x[pos] = keep0;
                keep1 = da_r * x[pos + inc_x];
                x[pos + 1 + inc_x] = da_r * x[pos + 1 + inc_x];
                x[pos + inc_x] = keep1;
            }
            for (; cnt < n; cnt++, pos += inc_x) {
                keep0 = da_r * x[pos];
                x[pos + 1] = da_r * x[pos + 1];
                x[pos] = keep0;
            }
        } else {
            head = n & -8;

            if (head > 0) {
                alpha[0] = da_r;
                alpha[1] = da_i;
                cscal_kernel_inc_8(head, alpha, x, inc_x);
                cnt = head;
                pos = head * inc_x;
            }

            for (; cnt < n; cnt++, pos += inc_x) {
                keep0 = da_r * x[pos] - da_i * x[pos + 1];
                x[pos + 1] = da_r * x[pos + 1] + da_i * x[pos];
                x[pos] = keep0;
            }
        }
    }

    return (0);
}
|
||||
|
|
@ -0,0 +1,183 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Exchange the contents of two contiguous buffers x and y with vector
 * loads and stores.
 *
 * n : element count; the loop trip count is n >> 5 (see the srlg below).
 *     The count is not checked for zero here; the driver in this file
 *     only calls the kernel with n1 = n & -32 when n1 > 0.
 * x, y : base addresses of the two buffers.
 *
 * Each pass moves 256 bytes in each direction: the x block is parked in
 * v16..v31, the y block is copied into x in two 128-byte halves through
 * v0..v7, and finally the parked x block is written to y.
 */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
    __asm__ volatile(
        /* r0 = n >> 5 (loop counter), r1 = 0 (running byte offset) */
        "srlg %%r0,%0,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 2, 1024(%%r1,%1) \n\t"
        "pfd 2, 1024(%%r1,%2) \n\t"

        /* park 256 bytes of x in v16..v31 */
        "vl %%v16, 0(%%r1,%1) \n\t"
        "vl %%v17, 16(%%r1,%1) \n\t"
        "vl %%v18, 32(%%r1,%1) \n\t"
        "vl %%v19, 48(%%r1,%1) \n\t"
        "vl %%v20, 64(%%r1,%1) \n\t"
        "vl %%v21, 80(%%r1,%1) \n\t"
        "vl %%v22, 96(%%r1,%1) \n\t"
        "vl %%v23, 112(%%r1,%1) \n\t"
        "vl %%v24, 128(%%r1,%1) \n\t"
        "vl %%v25, 144(%%r1,%1) \n\t"
        "vl %%v26, 160(%%r1,%1) \n\t"
        "vl %%v27, 176(%%r1,%1) \n\t"
        "vl %%v28, 192(%%r1,%1) \n\t"
        "vl %%v29, 208(%%r1,%1) \n\t"
        "vl %%v30, 224(%%r1,%1) \n\t"
        "vl %%v31, 240(%%r1,%1) \n\t"

        /* first 128 bytes: y -> x via v0..v7 */
        "vl %%v0, 0(%%r1,%2) \n\t"
        "vl %%v1, 16(%%r1,%2) \n\t"
        "vl %%v2, 32(%%r1,%2) \n\t"
        "vl %%v3, 48(%%r1,%2) \n\t"
        "vl %%v4, 64(%%r1,%2) \n\t"
        "vl %%v5, 80(%%r1,%2) \n\t"
        "vl %%v6, 96(%%r1,%2) \n\t"
        "vl %%v7, 112(%%r1,%2) \n\t"
        "vst %%v0, 0(%%r1,%1) \n\t"
        "vst %%v1, 16(%%r1,%1) \n\t"
        "vst %%v2, 32(%%r1,%1) \n\t"
        "vst %%v3, 48(%%r1,%1) \n\t"
        "vst %%v4, 64(%%r1,%1) \n\t"
        "vst %%v5, 80(%%r1,%1) \n\t"
        "vst %%v6, 96(%%r1,%1) \n\t"
        "vst %%v7, 112(%%r1,%1) \n\t"

        /* second 128 bytes: y -> x via v0..v7 */
        "vl %%v0, 128(%%r1,%2) \n\t"
        "vl %%v1, 144(%%r1,%2) \n\t"
        "vl %%v2, 160(%%r1,%2) \n\t"
        "vl %%v3, 176(%%r1,%2) \n\t"
        "vl %%v4, 192(%%r1,%2) \n\t"
        "vl %%v5, 208(%%r1,%2) \n\t"
        "vl %%v6, 224(%%r1,%2) \n\t"
        "vl %%v7, 240(%%r1,%2) \n\t"
        "vst %%v0, 128(%%r1,%1) \n\t"
        "vst %%v1, 144(%%r1,%1) \n\t"
        "vst %%v2, 160(%%r1,%1) \n\t"
        "vst %%v3, 176(%%r1,%1) \n\t"
        "vst %%v4, 192(%%r1,%1) \n\t"
        "vst %%v5, 208(%%r1,%1) \n\t"
        "vst %%v6, 224(%%r1,%1) \n\t"
        "vst %%v7, 240(%%r1,%1) \n\t"

        /* parked x block -> y */
        "vst %%v16, 0(%%r1,%2) \n\t"
        "vst %%v17, 16(%%r1,%2) \n\t"
        "vst %%v18, 32(%%r1,%2) \n\t"
        "vst %%v19, 48(%%r1,%2) \n\t"
        "vst %%v20, 64(%%r1,%2) \n\t"
        "vst %%v21, 80(%%r1,%2) \n\t"
        "vst %%v22, 96(%%r1,%2) \n\t"
        "vst %%v23, 112(%%r1,%2) \n\t"
        "vst %%v24, 128(%%r1,%2) \n\t"
        "vst %%v25, 144(%%r1,%2) \n\t"
        "vst %%v26, 160(%%r1,%2) \n\t"
        "vst %%v27, 176(%%r1,%2) \n\t"
        "vst %%v28, 192(%%r1,%2) \n\t"
        "vst %%v29, 208(%%r1,%2) \n\t"
        "vst %%v30, 224(%%r1,%2) \n\t"
        "vst %%v31, 240(%%r1,%2) \n\t"

        /* step the offset and loop while the counter in r0 is non-zero */
        "agfi %%r1,256 \n\t"
        "brctg %%r0,0b "
        :
        :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Swap the complex vectors x and y entry by entry.
 *
 * n            : number of complex entries; nothing is done when n <= 0.
 * x, inc_x     : first vector and its stride (in complex entries).
 * y, inc_y     : second vector and its stride (in complex entries).
 * dummy0, dummy1, dummy3, dummy4, dummy, dummy2 : not referenced.
 *
 * Always returns 0.
 *
 * When both strides are 1 the leading n & -32 entries are exchanged by
 * cswap_kernel_32; every remaining entry (and every entry in the strided
 * case) is swapped by the scalar loop at the end.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k = 0;             /* complex entries already swapped */
    BLASLONG px = 0, py = 0;    /* FLOAT indices into x and y */
    BLASLONG step_x, step_y;    /* FLOAT distance between entries */
    FLOAT saved[2];

    if (n <= 0) {
        return(0);
    }

    if (inc_x == 1 && inc_y == 1) {
        const BLASLONG blk = n & -32;

        if (blk > 0) {
            cswap_kernel_32(blk, x, y);
            k = blk;
            px = 2 * blk;
            py = 2 * blk;
        }
        step_x = 2;
        step_y = 2;
    } else {
        step_x = 2 * inc_x;
        step_y = 2 * inc_y;
    }

    for (; k < n; k++) {
        /* save x, copy y into x, then restore the saved pair into y */
        saved[0] = x[px];
        saved[1] = x[px + 1];
        x[px] = y[py];
        x[px + 1] = y[py + 1];
        y[py] = saved[0];
        y[py + 1] = saved[1];

        px += step_x;
        py += step_y;
    }

    return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,206 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * Return the largest absolute value found in a contiguous buffer, using
 * the double-precision vector instructions (vflpdb / vfchdb).
 *
 * n : element count; the loop trip count is n >> 5 (see the srlg below).
 *     The count is not checked for zero here; the driver in this file
 *     only calls the kernel with n1 = n & -32 when n1 > 0.
 * x : base address of the data.
 *
 * v0 holds the running per-lane maximum.  Each pass reduces two 128-byte
 * groups: absolute values are taken, then a compare/select tree
 * (8 -> 4 -> 2 -> 1 registers) is folded into v0.  After the loop the two
 * lanes of v0 are compared against each other and the winner is returned
 * through f0.
 */
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amax;

    __asm__ volatile (
        /* seed the running maximum with |first vector of x| */
        "vl %%v0,0(%2) \n\t"
        "vflpdb %%v0,%%v0 \n\t"
        /* r0 = n >> 5 (loop counter), r1 = 0 (running byte offset) */
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* ---- first 128 bytes: load and take absolute values ---- */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vflpdb %%v16, %%v16 \n\t"
        "vflpdb %%v17, %%v17 \n\t"
        "vflpdb %%v18, %%v18 \n\t"
        "vflpdb %%v19, %%v19 \n\t"
        "vflpdb %%v20, %%v20 \n\t"
        "vflpdb %%v21, %%v21 \n\t"
        "vflpdb %%v22, %%v22 \n\t"
        "vflpdb %%v23, %%v23 \n\t"

        /* 8 -> 4: mask = (a > b), then pick a where the mask is set */
        "vfchdb %%v24,%%v16,%%v17 \n\t"
        "vfchdb %%v25,%%v18,%%v19 \n\t"
        "vfchdb %%v26,%%v20,%%v21 \n\t"
        "vfchdb %%v27,%%v22,%%v23 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        /* 4 -> 2 */
        "vfchdb %%v28,%%v24,%%v25 \n\t"
        "vfchdb %%v29,%%v26,%%v27 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        /* 2 -> 1 */
        "vfchdb %%v30,%%v28,%%v29 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        /* fold into the running maximum */
        "vfchdb %%v31,%%v30,%%v0 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        /* ---- second 128 bytes: same reduction ---- */
        "vl %%v16,128(%%r1,%2) \n\t"
        "vl %%v17,144(%%r1,%2) \n\t"
        "vl %%v18,160(%%r1,%2) \n\t"
        "vl %%v19,176(%%r1,%2) \n\t"
        "vl %%v20,192(%%r1,%2) \n\t"
        "vl %%v21,208(%%r1,%2) \n\t"
        "vl %%v22,224(%%r1,%2) \n\t"
        "vl %%v23,240(%%r1,%2) \n\t"
        "vflpdb %%v16, %%v16 \n\t"
        "vflpdb %%v17, %%v17 \n\t"
        "vflpdb %%v18, %%v18 \n\t"
        "vflpdb %%v19, %%v19 \n\t"
        "vflpdb %%v20, %%v20 \n\t"
        "vflpdb %%v21, %%v21 \n\t"
        "vflpdb %%v22, %%v22 \n\t"
        "vflpdb %%v23, %%v23 \n\t"

        "vfchdb %%v24,%%v16,%%v17 \n\t"
        "vfchdb %%v25,%%v18,%%v19 \n\t"
        "vfchdb %%v26,%%v20,%%v21 \n\t"
        "vfchdb %%v27,%%v22,%%v23 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchdb %%v28,%%v24,%%v25 \n\t"
        "vfchdb %%v29,%%v26,%%v27 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchdb %%v30,%%v28,%%v29 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        "vfchdb %%v31,%%v30,%%v0 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        /* step the offset and loop while the counter in r0 is non-zero */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* final step: compare lane 1 of v0 against lane 0, keep the larger */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfchdb %%v17,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "ldr %0,%%f0 "
        :"=f"(amax)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return amax;
}
|
||||
|
||||
/*
 * Return the maximum absolute value among the n entries of x.
 *
 * n     : number of entries.
 * x     : data.
 * inc_x : stride between entries.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * With unit stride the leading n & -32 entries are reduced by
 * damax_kernel_32 and the rest by a scalar loop.  With any other stride
 * the scan is scalar, seeded from x[0] and unrolled four entries at a
 * time with a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG pos = 0;   /* index into x */
    BLASLONG cnt = 0;   /* entries consumed (strided path only) */
    BLASLONG blk;
    BLASLONG u;
    FLOAT best = 0.0;

    if (n <= 0 || inc_x <= 0) {
        return (best);
    }

    if (inc_x == 1) {
        blk = n & -32;

        if (blk > 0) {
            best = damax_kernel_32(blk, x);
            pos = blk;
        } else {
            /* too short for the kernel: seed from the first entry */
            best = ABS(x[0]);
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (ABS(x[pos]) > best) {
                best = ABS(x[pos]);
            }
        }

        return (best);
    }

    /* strided scan: x[0] seeds the result, so counting starts at 1 */
    best = ABS(x[0]);
    pos = inc_x;
    cnt = 1;

    blk = (n - 1) & -4;
    while (cnt < blk) {
        for (u = 0; u < 4; u++) {
            if (ABS(x[pos + u * inc_x]) > best) {
                best = ABS(x[pos + u * inc_x]);
            }
        }
        pos += inc_x * 4;
        cnt += 4;
    }

    for (; cnt < n; cnt++, pos += inc_x) {
        if (ABS(x[pos]) > best) {
            best = ABS(x[pos]);
        }
    }

    return (best);
}
|
||||
|
|
@ -0,0 +1,206 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * Return the smallest absolute value found in a contiguous buffer, using
 * the double-precision vector instructions (vflpdb / vfchdb).
 *
 * n : element count; the loop trip count is n >> 5 (see the srlg below).
 *     The count is not checked for zero here; the driver in this file
 *     only calls the kernel with n1 = n & -32 when n1 > 0.
 * x : base address of the data.
 *
 * v0 holds the running per-lane minimum.  Each pass reduces two 128-byte
 * groups: absolute values are taken, then a compare/select tree
 * (8 -> 4 -> 2 -> 1 registers) is folded into v0.  The compares are
 * written as (b > a) so that the following select keeps the smaller
 * operand a.  After the loop the two lanes of v0 are compared against
 * each other and the winner is returned through f0.
 */
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amin;

    __asm__ volatile (
        /* seed the running minimum with |first vector of x| */
        "vl %%v0,0(%2) \n\t"
        "vflpdb %%v0,%%v0 \n\t"
        /* r0 = n >> 5 (loop counter), r1 = 0 (running byte offset) */
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* ---- first 128 bytes: load and take absolute values ---- */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vflpdb %%v16, %%v16 \n\t"
        "vflpdb %%v17, %%v17 \n\t"
        "vflpdb %%v18, %%v18 \n\t"
        "vflpdb %%v19, %%v19 \n\t"
        "vflpdb %%v20, %%v20 \n\t"
        "vflpdb %%v21, %%v21 \n\t"
        "vflpdb %%v22, %%v22 \n\t"
        "vflpdb %%v23, %%v23 \n\t"

        /* 8 -> 4: mask = (b > a), then pick a where the mask is set */
        "vfchdb %%v24,%%v17,%%v16 \n\t"
        "vfchdb %%v25,%%v19,%%v18 \n\t"
        "vfchdb %%v26,%%v21,%%v20 \n\t"
        "vfchdb %%v27,%%v23,%%v22 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        /* 4 -> 2 */
        "vfchdb %%v28,%%v25,%%v24 \n\t"
        "vfchdb %%v29,%%v27,%%v26 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        /* 2 -> 1 */
        "vfchdb %%v30,%%v29,%%v28 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        /* fold into the running minimum */
        "vfchdb %%v31,%%v0,%%v30 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        /* ---- second 128 bytes: same reduction ---- */
        "vl %%v16,128(%%r1,%2) \n\t"
        "vl %%v17,144(%%r1,%2) \n\t"
        "vl %%v18,160(%%r1,%2) \n\t"
        "vl %%v19,176(%%r1,%2) \n\t"
        "vl %%v20,192(%%r1,%2) \n\t"
        "vl %%v21,208(%%r1,%2) \n\t"
        "vl %%v22,224(%%r1,%2) \n\t"
        "vl %%v23,240(%%r1,%2) \n\t"
        "vflpdb %%v16, %%v16 \n\t"
        "vflpdb %%v17, %%v17 \n\t"
        "vflpdb %%v18, %%v18 \n\t"
        "vflpdb %%v19, %%v19 \n\t"
        "vflpdb %%v20, %%v20 \n\t"
        "vflpdb %%v21, %%v21 \n\t"
        "vflpdb %%v22, %%v22 \n\t"
        "vflpdb %%v23, %%v23 \n\t"

        "vfchdb %%v24,%%v17,%%v16 \n\t"
        "vfchdb %%v25,%%v19,%%v18 \n\t"
        "vfchdb %%v26,%%v21,%%v20 \n\t"
        "vfchdb %%v27,%%v23,%%v22 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchdb %%v28,%%v25,%%v24 \n\t"
        "vfchdb %%v29,%%v27,%%v26 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchdb %%v30,%%v29,%%v28 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        "vfchdb %%v31,%%v0,%%v30 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        /* step the offset and loop while the counter in r0 is non-zero */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* final step: compare lane 0 of v0 against lane 1, keep the smaller */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfchdb %%v17,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "ldr %0,%%f0 "
        :"=f"(amin)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return amin;
}
|
||||
|
||||
/*
 * Return the minimum absolute value among the n entries of x.
 *
 * n     : number of entries.
 * x     : data.
 * inc_x : stride between entries.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * With unit stride the leading n & -32 entries are reduced by
 * damin_kernel_32 and the rest by a scalar loop.  With any other stride
 * the scan is scalar, seeded from x[0] and unrolled four entries at a
 * time with a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG pos = 0;   /* index into x */
    BLASLONG cnt = 0;   /* entries consumed (strided path only) */
    BLASLONG blk;
    BLASLONG u;
    FLOAT best = 0.0;

    if (n <= 0 || inc_x <= 0) {
        return (best);
    }

    if (inc_x == 1) {
        blk = n & -32;

        if (blk > 0) {
            best = damin_kernel_32(blk, x);
            pos = blk;
        } else {
            /* too short for the kernel: seed from the first entry */
            best = ABS(x[0]);
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (ABS(x[pos]) < best) {
                best = ABS(x[pos]);
            }
        }

        return (best);
    }

    /* strided scan: x[0] seeds the result, so counting starts at 1 */
    best = ABS(x[0]);
    pos = inc_x;
    cnt = 1;

    blk = (n - 1) & -4;
    while (cnt < blk) {
        for (u = 0; u < 4; u++) {
            if (ABS(x[pos + u * inc_x]) < best) {
                best = ABS(x[pos + u * inc_x]);
            }
        }
        pos += inc_x * 4;
        cnt += 4;
    }

    for (; cnt < n; cnt++, pos += inc_x) {
        if (ABS(x[pos]) < best) {
            best = ABS(x[pos]);
        }
    }

    return (best);
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -23,8 +23,7 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
|
@ -35,80 +34,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
FLOAT asum;
|
||||
|
||||
|
||||
__asm__ (
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v2 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"srlg %%r0,%1,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
"vl %%v20, 64(%%r1,%2) \n\t"
|
||||
"vl %%v21, 80(%%r1,%2) \n\t"
|
||||
"vl %%v22, 96(%%r1,%2) \n\t"
|
||||
"vl %%v23, 112(%%r1,%2) \n\t"
|
||||
|
||||
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
|
||||
FLOAT asum ;
|
||||
__asm__ (
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v2 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_temp] ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v26 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v30 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v31 \n\t"
|
||||
|
||||
"vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"la %[ptr_temp],256(%[ptr_temp]) \n\t"
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v26 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v30 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v31 \n\t"
|
||||
|
||||
"clgrjl %[ptr_temp],%%r0,1b \n\t"
|
||||
"vfadb %%v24,%%v0,%%v1 \n\t"
|
||||
"vfadb %%v25,%%v2,%%v3 \n\t"
|
||||
"vfadb %%v0,%%v25,%%v24 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %[asum],%%f0 \n\t"
|
||||
: [asum] "=f"(asum),[ptr_temp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x)
|
||||
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return asum;
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
||||
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
"vl %%v20, 192(%%r1,%2) \n\t"
|
||||
"vl %%v21, 208(%%r1,%2) \n\t"
|
||||
"vl %%v22, 224(%%r1,%2) \n\t"
|
||||
"vl %%v23, 240(%%r1,%2) \n\t"
|
||||
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
"vfadb %%v0,%%v0,%%v1 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v2 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v3 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(asum)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
);
|
||||
|
||||
return asum;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG j = 0;
|
||||
|
|
|
|||
|
|
@ -25,98 +25,99 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define PREFETCH_INS 1
|
||||
#if defined(Z13_A)
|
||||
#include <vecintrin.h>
|
||||
|
||||
/*
 * y += alpha * x on contiguous data, written with the compiler's vector
 * extension types instead of inline assembly.
 *
 * n     : element count; the outer loop runs while the vector index is
 *         below n / 2 and advances 16 vectors per pass.
 * x     : source data, reinterpreted as an array of __vector double.
 * y     : destination data, updated in place, reinterpreted the same way.
 * alpha : scalar multiplier, broadcast into both lanes of a vector.
 */
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
    __vector double scale = {alpha,alpha};
    __vector double *dst = (__vector double *)y;
    __vector double *src = (__vector double *)x;
    BLASLONG base;
    BLASLONG lane;

    for (base = 0; base < n / 2; base += 16) {
        /* 16 consecutive vectors, updated in ascending order */
        for (lane = 0; lane < 16; lane++) {
            dst[base + lane] += scale * src[base + lane];
        }
    }
}
|
||||
#else
|
||||
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
||||
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vlrepg %%v0,%3 \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,16(%%r1,%1) \n\t"
|
||||
"vl %%v18,32(%%r1,%1) \n\t"
|
||||
"vl %%v19,48(%%r1,%1) \n\t"
|
||||
"vl %%v20,0(%%r1,%2) \n\t"
|
||||
"vl %%v21,16(%%r1,%2) \n\t"
|
||||
"vl %%v22,32(%%r1,%2) \n\t"
|
||||
"vl %%v23,48(%%r1,%2) \n\t"
|
||||
|
||||
__asm__ volatile(
|
||||
#if defined(PREFETCH_INS)
|
||||
"pfd 1, 0(%[x_tmp]) \n\t"
|
||||
"pfd 2, 0(%[y_tmp]) \n\t"
|
||||
#endif
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v0,%%r0,%%r0 \n\t"
|
||||
"srlg %%r0,%[n],5 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
#if defined(PREFETCH_INS)
|
||||
"pfd 1, 256(%[x_tmp]) \n\t"
|
||||
"pfd 2, 256(%[y_tmp]) \n\t"
|
||||
#endif
|
||||
"vlm %%v16,%%v23, 0(%[x_tmp]) \n\t"
|
||||
"vlm %%v24, %%v31, 0(%[y_tmp]) \n\t"
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
|
||||
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
|
||||
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
|
||||
"vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
|
||||
"vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
|
||||
"vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
|
||||
"vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
|
||||
"vstm %%v16,%%v23, 0(%[y_tmp]) \n\t"
|
||||
"vlm %%v24,%%v31, 128(%[x_tmp]) \n\t"
|
||||
"vlm %%v16,%%v23, 128(%[y_tmp]) \n\t"
|
||||
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
|
||||
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
|
||||
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
|
||||
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
|
||||
"vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
|
||||
"vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
|
||||
"vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
|
||||
"vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
|
||||
"la %[x_tmp],256(%[x_tmp]) \n\t"
|
||||
"vstm %%v24, %%v31, 128(%[y_tmp]) \n\t"
|
||||
"la %[y_tmp],256(%[y_tmp]) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
: [mem_y] "+m" (*(double (*)[n])y), [x_tmp] "+&a"(x), [y_tmp] "+&a"(y)
|
||||
: [mem_x] "m" (*(const double (*)[n])x), [n] "r"(n), [alpha] "f"(alpha)
|
||||
:"cc", "r0", "v0","v1","v16","v17","v18","v19","v20","v21",
|
||||
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
|
||||
"vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
|
||||
"vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"
|
||||
|
||||
"vl %%v24,64(%%r1,%1) \n\t"
|
||||
"vl %%v25,80(%%r1,%1) \n\t"
|
||||
"vl %%v26,96(%%r1,%1) \n\t"
|
||||
"vl %%v27,112(%%r1,%1) \n\t"
|
||||
"vl %%v28,64(%%r1,%2) \n\t"
|
||||
"vl %%v29,80(%%r1,%2) \n\t"
|
||||
"vl %%v30,96(%%r1,%2) \n\t"
|
||||
"vl %%v31,112(%%r1,%2) \n\t"
|
||||
|
||||
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
|
||||
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
|
||||
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
|
||||
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
|
||||
|
||||
"vst %%v16,0(%%r1,%2) \n\t"
|
||||
"vst %%v17,16(%%r1,%2) \n\t"
|
||||
"vst %%v18,32(%%r1,%2) \n\t"
|
||||
"vst %%v19,48(%%r1,%2) \n\t"
|
||||
"vst %%v20,64(%%r1,%2) \n\t"
|
||||
"vst %%v21,80(%%r1,%2) \n\t"
|
||||
"vst %%v22,96(%%r1,%2) \n\t"
|
||||
"vst %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16,128(%%r1,%1) \n\t"
|
||||
"vl %%v17,144(%%r1,%1) \n\t"
|
||||
"vl %%v18,160(%%r1,%1) \n\t"
|
||||
"vl %%v19,176(%%r1,%1) \n\t"
|
||||
"vl %%v20,128(%%r1,%2) \n\t"
|
||||
"vl %%v21,144(%%r1,%2) \n\t"
|
||||
"vl %%v22,160(%%r1,%2) \n\t"
|
||||
"vl %%v23,176(%%r1,%2) \n\t"
|
||||
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
|
||||
"vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
|
||||
"vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"
|
||||
|
||||
"vl %%v24,192(%%r1,%1) \n\t"
|
||||
"vl %%v25,208(%%r1,%1) \n\t"
|
||||
"vl %%v26,224(%%r1,%1) \n\t"
|
||||
"vl %%v27,240(%%r1,%1) \n\t"
|
||||
"vl %%v28,192(%%r1,%2) \n\t"
|
||||
"vl %%v29,208(%%r1,%2) \n\t"
|
||||
"vl %%v30,224(%%r1,%2) \n\t"
|
||||
"vl %%v31,240(%%r1,%2) \n\t"
|
||||
|
||||
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
|
||||
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
|
||||
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
|
||||
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
|
||||
|
||||
"vst %%v16,128(%%r1,%2) \n\t"
|
||||
"vst %%v17,144(%%r1,%2) \n\t"
|
||||
"vst %%v18,160(%%r1,%2) \n\t"
|
||||
"vst %%v19,176(%%r1,%2) \n\t"
|
||||
"vst %%v20,192(%%r1,%2) \n\t"
|
||||
"vst %%v21,208(%%r1,%2) \n\t"
|
||||
"vst %%v22,224(%%r1,%2) \n\t"
|
||||
"vst %%v23,240(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
|
|
@ -131,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
BLASLONG n1 = n & -32;
|
||||
|
||||
if ( n1 )
|
||||
daxpy_kernel_32(n1, x, y , da );
|
||||
daxpy_kernel_32(n1, x, y , &da);
|
||||
|
||||
i = n1;
|
||||
while(i < n)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -23,95 +23,28 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(Z13mvc)
|
||||
|
||||
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"mvc 0(256,%[ptr_y]),0(%[ptr_x]) \n\t"
|
||||
"la %[ptr_x],256(%[ptr_x]) \n\t"
|
||||
"la %[ptr_y],256(%[ptr_y]) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n),
|
||||
[ptr_x] "+&a"(x), [ptr_y] "+&a"(y)
|
||||
: [mem_x] "m" (*(const double (*)[n])x)
|
||||
: "cc"
|
||||
);
|
||||
return;
|
||||
|
||||
/*
 * Block copy kernel built on the MVC (move characters) instruction.
 *
 * Copies (n >> 5) chunks of 256 bytes each from x to y.  With an 8-byte
 * FLOAT that is 32 elements per chunk (presumably FLOAT is double here,
 * as the "d" prefix suggests - confirm against the build).
 *
 * Register use (all three are listed as clobbers):
 *   r0 - remaining chunk count
 *   r1 - running source pointer (starts at x)
 *   r2 - running destination pointer (starts at y)
 *
 * NOTE(review): when n < 32 the count in r0 starts at 0; BRCTG decrements
 * before testing for zero, so the loop would not terminate promptly.
 * Presumably callers only invoke this with n >= 32 - confirm.
 */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* r1 = x, r2 = y: working copies so the input operands stay untouched */
"lgr %%r1,%1 \n\t"
|
||||
"lgr %%r2,%2 \n\t"
|
||||
/* r0 = n >> 5: number of 256-byte chunks */
"srlg %%r0,%0,5 \n\t"
|
||||
"0: \n\t"
|
||||
/* prefetch 1024 bytes ahead: code 1 on the source, code 2 on the destination */
"pfd 1, 1024(%%r1) \n\t"
|
||||
"pfd 2, 1024(%%r2) \n\t"
|
||||
/* copy 256 bytes from (r1) to (r2) */
"mvc 0(256,%%r2),0(%%r1) \n\t"
|
||||
/* advance both pointers by one chunk */
"agfi %%r1,256 \n\t"
|
||||
"agfi %%r2,256 \n\t"
|
||||
/* r0 -= 1; loop while r0 != 0 */
"brctg %%r0,0b "
|
||||
/* no outputs; the write to y is covered by the "memory" clobber */
:
|
||||
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
|
||||
:"memory","cc","r0","r1","r2"
|
||||
);
|
||||
}
|
||||
#else
|
||||
|
||||
/*
 * Block copy kernel using 16-byte vector loads and stores (VL / VST).
 *
 * Runs (n >> 5) iterations; each iteration moves 256 bytes from x to y as
 * sixteen load/store pairs at offsets 0, 16, ..., 240, cycling through
 * vector registers v24-v27.
 *
 * Operands:
 *   n_tmp        - n, updated in place to hold the iteration count
 *   ptr_x, ptr_y - base addresses of x and y (not modified)
 *   mem_x, mem_y - memory operands that tell the compiler x is read and
 *                  y is written as arrays of n doubles
 *   r1           - byte offset from the bases, starts at 0, +256 per pass
 *
 * NOTE(review): when n < 32 the count starts at 0; BRCTG decrements before
 * testing for zero, so the loop would not terminate promptly.  Presumably
 * callers only invoke this with n >= 32 - confirm.
 */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
|
||||
__asm__ volatile(
|
||||
/* initial prefetch of the first bytes of source and destination */
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
/* n_tmp = n >> 5: number of 256-byte passes */
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
/* r1 = 0: running byte offset */
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
/* prefetch the data for the next pass (256 bytes ahead) */
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
/* bytes 0..63 */
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
/* bytes 64..127 */
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
/* bytes 128..191 */
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
/* bytes 192..255 */
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 240(%%r1,%[ptr_y]) \n\t"
|
||||
/* offset += 256, then n_tmp -= 1 and loop while n_tmp != 0 */
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n)
|
||||
: [mem_x] "m" (*(const double (*)[n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v24","v25","v26","v27"
|
||||
);
|
||||
return;
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||
BLASLONG i = 0;
|
||||
|
|
@ -136,21 +69,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
|||
|
||||
} else {
|
||||
|
||||
BLASLONG n1 = n & -4;
|
||||
|
||||
while (i < n1) {
|
||||
|
||||
y[iy] = x[ix];
|
||||
y[iy + inc_y] = x[ix + inc_x];
|
||||
y[iy + 2 * inc_y] = x[ix + 2 * inc_x];
|
||||
y[iy + 3 * inc_y] = x[ix + 3 * inc_x];
|
||||
|
||||
ix += inc_x * 4;
|
||||
iy += inc_y * 4;
|
||||
i += 4;
|
||||
|
||||
}
|
||||
|
||||
while (i < n) {
|
||||
|
||||
y[iy] = x[ix];
|
||||
|
|
@ -165,5 +83,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
|||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,116 +25,59 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(Z13)
|
||||
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
FLOAT dot;
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
|
||||
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
|
||||
|
||||
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
|
||||
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
|
||||
|
||||
"vl %%v16, 64(%%r1 ,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19, 112(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
__asm__ volatile (
|
||||
"vzero %%v0 \n\t"
|
||||
"srlg %%r0,%1,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 2,1024(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
|
||||
|
||||
|
||||
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
|
||||
"vl %%v31, 112(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
|
||||
|
||||
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b \n\t"
|
||||
"vfadb %%v24,%%v25,%%v24 \n\t"
|
||||
"vfadb %%v24,%%v26,%%v24 \n\t"
|
||||
"vfadb %%v24,%%v27,%%v24 \n\t"
|
||||
"vrepg %%v1,%%v24,1 \n\t"
|
||||
"vfadb %%v1,%%v24,%%v1 \n\t"
|
||||
"ldr %[dot], %%f1 \n\t"
|
||||
: [dot] "=f"(dot) ,[n_tmp] "+&r"(n)
|
||||
: [mem_x] "m"( *(const double (*)[n])x),
|
||||
[mem_y] "m"( *(const double (*)[n])y),
|
||||
[ptr_x_tmp]"a"(x), [ptr_y_tmp] "a"(y)
|
||||
:"cc" , "r1","f1","v16", "v17","v18","v19","v20","v21","v22","v23",
|
||||
"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
"vl %%v16,0(%%r1,%2) \n\t"
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
"vl %%v18,32(%%r1,%2) \n\t"
|
||||
"vl %%v19,48(%%r1,%2) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
);
|
||||
return dot;
|
||||
"vl %%v24,0(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,16(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
|
||||
"vl %%v26,32(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
|
||||
"vl %%v27,48(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
|
||||
"vl %%v28,64(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
|
||||
"vl %%v29,80(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
|
||||
"vl %%v30,96(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
|
||||
"vl %%v31,112(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(dot)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Portable dot-product kernel: returns the sum of y[k] * x[k], consuming
 * 16 elements per outer pass while the pass start index is below n.
 *
 * Each pass is split into two halves of 8 products.  Within a half the
 * products are summed left to right into a partial value, which is then
 * added to the running total.
 *
 * NOTE(review): a pass always reads 16 elements, so presumably n is a
 * multiple of 16 - confirm against the caller.
 */
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y )
{
    FLOAT total = 0.0;
    BLASLONG start;
    BLASLONG half;
    BLASLONG k;

    for (start = 0; start < n; start += 16) {
        for (half = 0; half < 16; half += 8) {
            const FLOAT *xs = x + start + half;
            const FLOAT *ys = y + start + half;
            FLOAT partial = ys[0] * xs[0];

            /* accumulate products 1..7 left to right */
            for (k = 1; k < 8; k++) {
                partial = partial + ys[k] * xs[k];
            }
            total += partial;
        }
    }

    return total;
}
|
||||
|
||||
#endif
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
|
@ -148,13 +91,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
{
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
|
||||
if ( n1 ){
|
||||
dot = ddot_kernel_16(n1, x, y );
|
||||
i = n1;
|
||||
}
|
||||
|
||||
|
||||
if ( n1 )
|
||||
dot = ddot_kernel_16(n1, x, y);
|
||||
|
||||
i = n1;
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
|
|
|
|||
|
|
@ -25,186 +25,392 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
#define HAVE_KERNEL_4x4_VEC 1
|
||||
#define HAVE_KERNEL_4x2_VEC 1
|
||||
#define HAVE_KERNEL_4x1_VEC 1
|
||||
|
||||
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
||||
#include <vecintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x4
|
||||
|
||||
#elif HAVE_KERNEL_4x4_VEC
|
||||
|
||||
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT x0,x1,x2,x3;
|
||||
x0 = xo[0] * *alpha;
|
||||
x1 = xo[1] * *alpha;
|
||||
x2 = xo[2] * *alpha;
|
||||
x3 = xo[3] * *alpha;
|
||||
__vector double v_x0 = {x0,x0};
|
||||
__vector double v_x1 = {x1,x1};
|
||||
__vector double v_x2 = {x2,x2};
|
||||
__vector double v_x3 = {x3,x3};
|
||||
__vector double* v_y =(__vector double*)y;
|
||||
__vector double* va0 = (__vector double*)ap[0];
|
||||
__vector double* va1 = (__vector double*)ap[1];
|
||||
__vector double* va2 = (__vector double*)ap[2];
|
||||
__vector double* va3 = (__vector double*)ap[3];
|
||||
__asm__ volatile (
|
||||
"vlrepg %%v0,0(%5) \n\t"
|
||||
"vlrepg %%v1,8(%5) \n\t"
|
||||
"vlrepg %%v2,16(%5) \n\t"
|
||||
"vlrepg %%v3,24(%5) \n\t"
|
||||
"vlrepg %%v4,%7 \n\t"
|
||||
"vfmdb %%v0,%%v0,%%v4 \n\t"
|
||||
"vfmdb %%v1,%%v1,%%v4 \n\t"
|
||||
"vfmdb %%v2,%%v2,%%v4 \n\t"
|
||||
"vfmdb %%v3,%%v3,%%v4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
for ( i=0; i< n/2; i+=2 )
|
||||
{
|
||||
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;
|
||||
v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ;
|
||||
}
|
||||
}
|
||||
"lghi %%r0,-16 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
#else
|
||||
"srlg %%r0,%%r0,4 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 1,1024(%%r1,%3) \n\t"
|
||||
"pfd 1,1024(%%r1,%4) \n\t"
|
||||
"pfd 2,1024(%%r1,%6) \n\t"
|
||||
|
||||
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *a0,*a1,*a2,*a3;
|
||||
FLOAT x[4] __attribute__ ((aligned (16)));
|
||||
a0 = ap[0];
|
||||
a1 = ap[1];
|
||||
a2 = ap[2];
|
||||
a3 = ap[3];
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,0(%%r1,%3) \n\t"
|
||||
"vl %%v19,0(%%r1,%4) \n\t"
|
||||
"vl %%v20,16(%%r1,%1) \n\t"
|
||||
"vl %%v21,16(%%r1,%2) \n\t"
|
||||
"vl %%v22,16(%%r1,%3) \n\t"
|
||||
"vl %%v23,16(%%r1,%4) \n\t"
|
||||
"vl %%v24,32(%%r1,%1) \n\t"
|
||||
"vl %%v25,32(%%r1,%2) \n\t"
|
||||
"vl %%v26,32(%%r1,%3) \n\t"
|
||||
"vl %%v27,32(%%r1,%4) \n\t"
|
||||
"vl %%v28,48(%%r1,%1) \n\t"
|
||||
"vl %%v29,48(%%r1,%2) \n\t"
|
||||
"vl %%v30,48(%%r1,%3) \n\t"
|
||||
"vl %%v31,48(%%r1,%4) \n\t"
|
||||
|
||||
for ( i=0; i<4; i++)
|
||||
x[i] = xo[i] * *alpha;
|
||||
"vl %%v4,0(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,0(%%r1,%6) \n\t"
|
||||
|
||||
for ( i=0; i< n; i+=4 )
|
||||
{
|
||||
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
|
||||
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
|
||||
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
|
||||
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
|
||||
}
|
||||
}
|
||||
"vl %%v4,16(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,16(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,32(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,32(%%r1,%6) \n\t"
|
||||
|
||||
#endif
|
||||
"vl %%v4,48(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,48(%%r1,%6) \n\t"
|
||||
|
||||
#ifdef HAVE_KERNEL_4x2
|
||||
"vl %%v16,64(%%r1,%1) \n\t"
|
||||
"vl %%v17,64(%%r1,%2) \n\t"
|
||||
"vl %%v18,64(%%r1,%3) \n\t"
|
||||
"vl %%v19,64(%%r1,%4) \n\t"
|
||||
"vl %%v20,80(%%r1,%1) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,80(%%r1,%3) \n\t"
|
||||
"vl %%v23,80(%%r1,%4) \n\t"
|
||||
"vl %%v24,96(%%r1,%1) \n\t"
|
||||
"vl %%v25,96(%%r1,%2) \n\t"
|
||||
"vl %%v26,96(%%r1,%3) \n\t"
|
||||
"vl %%v27,96(%%r1,%4) \n\t"
|
||||
"vl %%v28,112(%%r1,%1) \n\t"
|
||||
"vl %%v29,112(%%r1,%2) \n\t"
|
||||
"vl %%v30,112(%%r1,%3) \n\t"
|
||||
"vl %%v31,112(%%r1,%4) \n\t"
|
||||
|
||||
#elif HAVE_KERNEL_4x2_VEC
|
||||
"vl %%v4,64(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,64(%%r1,%6) \n\t"
|
||||
|
||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT x0,x1;
|
||||
x0 = xo[0] * *alpha;
|
||||
x1 = xo[1] * *alpha;
|
||||
__vector double v_x0 = {x0,x0};
|
||||
__vector double v_x1 = {x1,x1};
|
||||
__vector double* v_y =(__vector double*)y;
|
||||
__vector double* va0 = (__vector double*)ap[0];
|
||||
__vector double* va1 = (__vector double*)ap[1];
|
||||
"vl %%v4,80(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,80(%%r1,%6) \n\t"
|
||||
|
||||
for ( i=0; i< n/2; i+=2 )
|
||||
{
|
||||
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ;
|
||||
v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ;
|
||||
}
|
||||
}
|
||||
#else
|
||||
"vl %%v4,96(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,96(%%r1,%6) \n\t"
|
||||
|
||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *a0,*a1;
|
||||
FLOAT x[4] __attribute__ ((aligned (16)));
|
||||
a0 = ap[0];
|
||||
a1 = ap[1];
|
||||
|
||||
for ( i=0; i<2; i++)
|
||||
x[i] = xo[i] * *alpha;
|
||||
|
||||
for ( i=0; i< n; i+=4 )
|
||||
{
|
||||
y[i] += a0[i]*x[0] + a1[i]*x[1];
|
||||
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1];
|
||||
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1];
|
||||
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x1
|
||||
|
||||
#elif HAVE_KERNEL_4x1_VEC
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT x0;
|
||||
x0 = xo[0] * *alpha;
|
||||
__vector double v_x0 = {x0,x0};
|
||||
__vector double* v_y =(__vector double*)y;
|
||||
__vector double* va0 = (__vector double*)ap;
|
||||
|
||||
for ( i=0; i< n/2; i+=2 )
|
||||
{
|
||||
v_y[i] += v_x0 * va0[i] ;
|
||||
v_y[i+1] += v_x0 * va0[i+1] ;
|
||||
}
|
||||
"vl %%v4,112(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,112(%%r1,%6) \n\t"
|
||||
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
"lghi %%r0,12 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,0(%%r1,%3) \n\t"
|
||||
"vl %%v19,0(%%r1,%4) \n\t"
|
||||
"vl %%v20,16(%%r1,%1) \n\t"
|
||||
"vl %%v21,16(%%r1,%2) \n\t"
|
||||
"vl %%v22,16(%%r1,%3) \n\t"
|
||||
"vl %%v23,16(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v4,0(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,0(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,16(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,16(%%r1,%6) \n\t"
|
||||
|
||||
"agfi %%r1,32 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *a0;
|
||||
FLOAT x[4] __attribute__ ((aligned (16)));
|
||||
a0 = ap;
|
||||
__asm__ volatile (
|
||||
"vlrepg %%v0,0(%3) \n\t"
|
||||
"vlrepg %%v1,8(%3) \n\t"
|
||||
"vlrepg %%v2,%5 \n\t"
|
||||
"vfmdb %%v0,%%v0,%%v2 \n\t"
|
||||
"vfmdb %%v1,%%v1,%%v2 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
for ( i=0; i<1; i++)
|
||||
x[i] = xo[i] * *alpha;
|
||||
"lghi %%r0,-16 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
for ( i=0; i< n; i+=4 )
|
||||
{
|
||||
y[i] += a0[i]*x[0];
|
||||
y[i+1] += a0[i+1]*x[0];
|
||||
y[i+2] += a0[i+2]*x[0];
|
||||
y[i+3] += a0[i+3]*x[0];
|
||||
}
|
||||
"srlg %%r0,%%r0,4 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 2,1024(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,16(%%r1,%1) \n\t"
|
||||
"vl %%v19,16(%%r1,%2) \n\t"
|
||||
"vl %%v20,32(%%r1,%1) \n\t"
|
||||
"vl %%v21,32(%%r1,%2) \n\t"
|
||||
"vl %%v22,48(%%r1,%1) \n\t"
|
||||
"vl %%v23,48(%%r1,%2) \n\t"
|
||||
"vl %%v24,64(%%r1,%1) \n\t"
|
||||
"vl %%v25,64(%%r1,%2) \n\t"
|
||||
"vl %%v26,80(%%r1,%1) \n\t"
|
||||
"vl %%v27,80(%%r1,%2) \n\t"
|
||||
"vl %%v28,96(%%r1,%1) \n\t"
|
||||
"vl %%v29,96(%%r1,%2) \n\t"
|
||||
"vl %%v30,112(%%r1,%1) \n\t"
|
||||
"vl %%v31,112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v2,0(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v16,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v17,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,0(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,16(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,16(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,32(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v20,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v21,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,32(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,48(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v22,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v23,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,48(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,64(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v24,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v25,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,64(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,80(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v26,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v27,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,80(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,96(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v28,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v29,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,96(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,112(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v30,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v31,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,112(%%r1,%4) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
"lghi %%r0,12 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,16(%%r1,%1) \n\t"
|
||||
"vl %%v19,16(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v2,0(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v16,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v17,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,0(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,16(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,16(%%r1,%4) \n\t"
|
||||
|
||||
"agfi %%r1,32 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile (
|
||||
"vlrepg %%v0,0(%2) \n\t"
|
||||
"vlrepg %%v1,%4 \n\t"
|
||||
"vfmdb %%v0,%%v0,%%v1 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
#endif
|
||||
"lghi %%r0,-16 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
|
||||
"srlg %%r0,%%r0,4 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 2,1024(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,16(%%r1,%1) \n\t"
|
||||
"vl %%v18,32(%%r1,%1) \n\t"
|
||||
"vl %%v19,48(%%r1,%1) \n\t"
|
||||
"vl %%v20,64(%%r1,%1) \n\t"
|
||||
"vl %%v21,80(%%r1,%1) \n\t"
|
||||
"vl %%v22,96(%%r1,%1) \n\t"
|
||||
"vl %%v23,112(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v1,0(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,0(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,16(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,16(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,32(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v18,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,32(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,48(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v19,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,48(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,64(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v20,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,64(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,80(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v21,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,80(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,96(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v22,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,96(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,112(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v23,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,112(%%r1,%3) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
"lghi %%r0,12 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,16(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v1,0(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,0(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,16(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,16(%%r1,%3) \n\t"
|
||||
|
||||
"agfi %%r1,32 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
||||
{
|
||||
BLASLONG i;
|
||||
|
||||
for ( i=0; i<n; i++ ){
|
||||
*dest += *src;
|
||||
src++;
|
||||
dest += inc_dest;
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
*dest += src[i];
|
||||
dest += inc_dest;
|
||||
}
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i;
|
||||
BLASLONG j;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
|
|
|
|||
|
|
@ -25,178 +25,460 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define HAVE_KERNEL_4x4_VEC 1
|
||||
#define HAVE_KERNEL_4x2_VEC 1
|
||||
#define HAVE_KERNEL_4x1_VEC 1
|
||||
|
||||
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
||||
#include <vecintrin.h>
|
||||
#endif
|
||||
#define NBMAX 2048
|
||||
|
||||
#ifdef HAVE_KERNEL_4x4
|
||||
|
||||
#elif HAVE_KERNEL_4x4_VEC
|
||||
|
||||
/*
 * Vector (inline asm) kernel: four dot products of the rows ap[0..3] with x,
 * stored to y[0..3].
 *
 * Asm operands: %0 = n, %1..%4 = ap[0]..ap[3], %5 = x, %6 = y.
 * r1 is the running byte offset into every input, r0 the loop counter,
 * v0..v3 the per-row accumulators (two doubles each).
 *
 * The main loop consumes (n & -16) elements, 16 per iteration; the tail loop
 * consumes (n & 12) elements, 4 per iteration.  The low two bits of n are
 * never looked at, so n is presumably always a multiple of 4 -- confirm
 * against the caller.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* clear accumulators and byte offset */
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "vzero %%v2 \n\t"
        "vzero %%v3 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -16; skip the main loop when that is zero */
        "lghi %%r0,-16 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        "srlg %%r0,%%r0,4 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 1,1024(%%r1,%3) \n\t"
        "pfd 1,1024(%%r1,%4) \n\t"
        "pfd 1,1024(%%r1,%5) \n\t"

        /* 16 elements of x */
        "vl %%v16,0(%%r1,%5) \n\t"
        "vl %%v17,16(%%r1,%5) \n\t"
        "vl %%v18,32(%%r1,%5) \n\t"
        "vl %%v19,48(%%r1,%5) \n\t"
        "vl %%v20,64(%%r1,%5) \n\t"
        "vl %%v21,80(%%r1,%5) \n\t"
        "vl %%v22,96(%%r1,%5) \n\t"
        "vl %%v23,112(%%r1,%5) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
        "vl %%v26,0(%%r1,%3) \n\t"
        "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t"
        "vl %%v27,0(%%r1,%4) \n\t"
        "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t"

        "vl %%v28,16(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t"
        "vl %%v29,16(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t"
        "vl %%v30,16(%%r1,%3) \n\t"
        "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t"
        "vl %%v31,16(%%r1,%4) \n\t"
        "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t"

        "vl %%v24,32(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v18,%%v24,%%v0 \n\t"
        "vl %%v25,32(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v18,%%v25,%%v1 \n\t"
        "vl %%v26,32(%%r1,%3) \n\t"
        "vfmadb %%v2,%%v18,%%v26,%%v2 \n\t"
        "vl %%v27,32(%%r1,%4) \n\t"
        "vfmadb %%v3,%%v18,%%v27,%%v3 \n\t"

        "vl %%v28,48(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v19,%%v28,%%v0 \n\t"
        "vl %%v29,48(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v19,%%v29,%%v1 \n\t"
        "vl %%v30,48(%%r1,%3) \n\t"
        "vfmadb %%v2,%%v19,%%v30,%%v2 \n\t"
        "vl %%v31,48(%%r1,%4) \n\t"
        "vfmadb %%v3,%%v19,%%v31,%%v3 \n\t"

        "vl %%v24,64(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t"
        "vl %%v25,64(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t"
        "vl %%v26,64(%%r1,%3) \n\t"
        "vfmadb %%v2,%%v20,%%v26,%%v2 \n\t"
        "vl %%v27,64(%%r1,%4) \n\t"
        "vfmadb %%v3,%%v20,%%v27,%%v3 \n\t"

        "vl %%v28,80(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v21,%%v28,%%v0 \n\t"
        "vl %%v29,80(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v21,%%v29,%%v1 \n\t"
        "vl %%v30,80(%%r1,%3) \n\t"
        "vfmadb %%v2,%%v21,%%v30,%%v2 \n\t"
        "vl %%v31,80(%%r1,%4) \n\t"
        "vfmadb %%v3,%%v21,%%v31,%%v3 \n\t"

        "vl %%v24,96(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v22,%%v24,%%v0 \n\t"
        "vl %%v25,96(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v22,%%v25,%%v1 \n\t"
        "vl %%v26,96(%%r1,%3) \n\t"
        "vfmadb %%v2,%%v22,%%v26,%%v2 \n\t"
        "vl %%v27,96(%%r1,%4) \n\t"
        "vfmadb %%v3,%%v22,%%v27,%%v3 \n\t"

        "vl %%v28,112(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v23,%%v28,%%v0 \n\t"
        "vl %%v29,112(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v23,%%v29,%%v1 \n\t"
        "vl %%v30,112(%%r1,%3) \n\t"
        "vfmadb %%v2,%%v23,%%v30,%%v2 \n\t"
        "vl %%v31,112(%%r1,%4) \n\t"
        "vfmadb %%v3,%%v23,%%v31,%%v3 \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        /* tail: r0 = n & 12, processed 4 elements (32 bytes) at a time */
        "1: \n\t"
        "lghi %%r0,12 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%5) \n\t"
        "vl %%v17,16(%%r1,%5) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
        "vl %%v26,0(%%r1,%3) \n\t"
        "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t"
        "vl %%v27,0(%%r1,%4) \n\t"
        "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t"

        "vl %%v28,16(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t"
        "vl %%v29,16(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t"
        "vl %%v30,16(%%r1,%3) \n\t"
        "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t"
        "vl %%v31,16(%%r1,%4) \n\t"
        "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t"

        "agfi %%r1,32 \n\t"
        "brctg %%r0,2b \n\t"

        /* add the two lanes of each accumulator and store y[0..3] */
        "3: \n\t"
        "vrepg %%v4,%%v0,1 \n\t"
        "adbr %%f0,%%f4 \n\t"
        "std %%f0,0(%6) \n\t"
        "vrepg %%v4,%%v1,1 \n\t"
        "adbr %%f1,%%f4 \n\t"
        "std %%f1,8(%6) \n\t"
        "vrepg %%v4,%%v2,1 \n\t"
        "adbr %%f2,%%f4 \n\t"
        "std %%f2,16(%6) \n\t"
        "vrepg %%v4,%%v3,1 \n\t"
        "adbr %%f3,%%f4 \n\t"
        "std %%f3,24(%6) "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
#else
|
||||
/*
 * Portable fallback: dot products of the four rows ap[0..3] with x,
 * written to y[0..3].  Elements are consumed four per iteration, so the
 * loop reads up to the next multiple of 4 at or above n.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    FLOAT *row0 = ap[0];
    FLOAT *row1 = ap[1];
    FLOAT *row2 = ap[2];
    FLOAT *row3 = ap[3];
    FLOAT sum0 = 0.0;
    FLOAT sum1 = 0.0;
    FLOAT sum2 = 0.0;
    FLOAT sum3 = 0.0;
    BLASLONG k = 0;

    while (k < n)
    {
        sum0 += row0[k]*x[k] + row0[k+1]*x[k+1] + row0[k+2]*x[k+2] + row0[k+3]*x[k+3];
        sum1 += row1[k]*x[k] + row1[k+1]*x[k+1] + row1[k+2]*x[k+2] + row1[k+3]*x[k+3];
        sum2 += row2[k]*x[k] + row2[k+1]*x[k+1] + row2[k+2]*x[k+2] + row2[k+3]*x[k+3];
        sum3 += row3[k]*x[k] + row3[k+1]*x[k+1] + row3[k+2]*x[k+2] + row3[k+3]*x[k+3];
        k += 4;
    }

    y[0] = sum0;
    y[1] = sum1;
    y[2] = sum2;
    y[3] = sum3;
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x2
|
||||
|
||||
#elif HAVE_KERNEL_4x2_VEC
|
||||
|
||||
/*
 * Vector (inline asm) kernel: dot products of the rows ap[0] and ap[1] with x,
 * stored to y[0] and y[1].
 *
 * Asm operands: %0 = n, %1 = ap[0], %2 = ap[1], %3 = x, %4 = y.
 * r1 is the running byte offset, r0 the loop counter, v0/v1 the accumulators.
 *
 * The main loop consumes (n & -16) elements, 16 per iteration; the tail loop
 * consumes (n & 12) elements, 4 per iteration.  The low two bits of n are
 * ignored, so n is presumably a multiple of 4 -- confirm against the caller.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -16; skip the main loop when that is zero */
        "lghi %%r0,-16 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        "srlg %%r0,%%r0,4 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 1,1024(%%r1,%3) \n\t"

        /* 16 elements of x */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"

        "vl %%v26,16(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t"
        "vl %%v27,16(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t"

        "vl %%v28,32(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v18,%%v28,%%v0 \n\t"
        "vl %%v29,32(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v18,%%v29,%%v1 \n\t"

        "vl %%v30,48(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v19,%%v30,%%v0 \n\t"
        "vl %%v31,48(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v19,%%v31,%%v1 \n\t"

        "vl %%v24,64(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t"
        "vl %%v25,64(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t"

        "vl %%v26,80(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v21,%%v26,%%v0 \n\t"
        "vl %%v27,80(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v21,%%v27,%%v1 \n\t"

        "vl %%v28,96(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v22,%%v28,%%v0 \n\t"
        "vl %%v29,96(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v22,%%v29,%%v1 \n\t"

        "vl %%v30,112(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v23,%%v30,%%v0 \n\t"
        "vl %%v31,112(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v23,%%v31,%%v1 \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        /* tail: r0 = n & 12, processed 4 elements (32 bytes) at a time */
        "1: \n\t"
        "lghi %%r0,12 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"

        "vl %%v26,16(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t"
        "vl %%v27,16(%%r1,%2) \n\t"
        "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t"

        "agfi %%r1,32 \n\t"
        "brctg %%r0,2b \n\t"

        /* add the two lanes of each accumulator and store y[0], y[1] */
        "3: \n\t"
        "vrepg %%v2,%%v0,1 \n\t"
        "adbr %%f0,%%f2 \n\t"
        "std %%f0,0(%4) \n\t"
        "vrepg %%v2,%%v1,1 \n\t"
        "adbr %%f1,%%f2 \n\t"
        "std %%f1,8(%4) "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y)
        :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
#else
|
||||
/*
 * Portable fallback: dot products of the rows ap[0] and ap[1] with x,
 * written to y[0] and y[1].  Elements are consumed four per iteration.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    FLOAT *row0 = ap[0];
    FLOAT *row1 = ap[1];
    FLOAT sum0 = 0.0;
    FLOAT sum1 = 0.0;
    BLASLONG k = 0;

    while (k < n)
    {
        sum0 += row0[k]*x[k] + row0[k+1]*x[k+1] + row0[k+2]*x[k+2] + row0[k+3]*x[k+3];
        sum1 += row1[k]*x[k] + row1[k+1]*x[k+1] + row1[k+2]*x[k+2] + row1[k+3]*x[k+3];
        k += 4;
    }

    y[0] = sum0;
    y[1] = sum1;
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x1
|
||||
|
||||
#elif HAVE_KERNEL_4x1_VEC
|
||||
|
||||
/*
 * Vector (inline asm) kernel: dot product of the row a0 with x, stored to y[0].
 *
 * Asm operands: %0 = n, %1 = a0, %2 = x, %3 = y.
 * r1 is the running byte offset, r0 the loop counter, v0 the accumulator.
 *
 * The main loop consumes (n & -16) elements, 16 per iteration; the tail loop
 * consumes (n & 12) elements, 4 per iteration.  The low two bits of n are
 * ignored, so n is presumably a multiple of 4 -- confirm against the caller.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        "vzero %%v0 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -16; skip the main loop when that is zero */
        "lghi %%r0,-16 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        "srlg %%r0,%%r0,4 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"

        /* 16 elements of x */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"

        "vl %%v25,16(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"

        "vl %%v26,32(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"

        "vl %%v27,48(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"

        "vl %%v28,64(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"

        "vl %%v29,80(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"

        "vl %%v30,96(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"

        "vl %%v31,112(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        /* tail: r0 = n & 12, processed 4 elements (32 bytes) at a time */
        "1: \n\t"
        "lghi %%r0,12 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"

        "vl %%v25,16(%%r1,%1) \n\t"
        "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"

        "agfi %%r1,32 \n\t"
        "brctg %%r0,2b \n\t"

        /* add the two lanes of the accumulator and store y[0] */
        "3: \n\t"
        "vrepg %%v1,%%v0,1 \n\t"
        "adbr %%f0,%%f1 \n\t"
        "std %%f0,0(%3) "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y)
        :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
#else
/*
 * Portable fallback: dot product of the row a0 with x, written to y[0].
 * Elements are consumed four per iteration.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
    BLASLONG i;
    FLOAT temp0 = 0.0;

    for ( i=0; i< n; i+=4 )
    {
        temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
    }
    y[0] = temp0;
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Gather a strided vector into a contiguous buffer:
 *   dest[i] = src[i * inc_src]   for i = 0 .. n-1.
 *
 * n       number of elements to copy
 * src     strided source, advanced by inc_src elements per step
 * dest    contiguous destination with room for n elements
 * inc_src stride of src, in elements
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    BLASLONG i;

    for (i = 0; i < n; i++)
    {
        dest[i] = *src;
        src += inc_src;
    }
}
|
||||
|
||||
/*
 * Vector (inline asm) kernel for contiguous data: dest[i] += da * src[i].
 *
 * Asm operands: %0 = n, %1 = da (in memory), %2 = src, %3 = dest.
 * v0 holds da replicated into both lanes, r1 is the running byte offset,
 * r0 the loop counter.
 *
 * The main loop updates (n & -16) elements, 16 per iteration; the tail loop
 * updates (n & 12) elements, 4 per iteration.  The low two bits of n are
 * ignored, so n is presumably a multiple of 4 -- confirm against the caller.
 */
static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{
    __asm__ volatile (
        "vlrepg %%v0,%1 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -16; skip the main loop when that is zero */
        "lghi %%r0,-16 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        "srlg %%r0,%%r0,4 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 2,1024(%%r1,%3) \n\t"

        /* 16 elements of src */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        /* load dest, fused multiply-add, store back */
        "vl %%v24, 0(%%r1,%3) \n\t"
        "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
        "vst %%v24, 0(%%r1,%3) \n\t"
        "vl %%v25, 16(%%r1,%3) \n\t"
        "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t"
        "vst %%v25, 16(%%r1,%3) \n\t"
        "vl %%v26, 32(%%r1,%3) \n\t"
        "vfmadb %%v26,%%v18,%%v0,%%v26 \n\t"
        "vst %%v26, 32(%%r1,%3) \n\t"
        "vl %%v27, 48(%%r1,%3) \n\t"
        "vfmadb %%v27,%%v19,%%v0,%%v27 \n\t"
        "vst %%v27, 48(%%r1,%3) \n\t"
        "vl %%v28, 64(%%r1,%3) \n\t"
        "vfmadb %%v28,%%v20,%%v0,%%v28 \n\t"
        "vst %%v28, 64(%%r1,%3) \n\t"
        "vl %%v29, 80(%%r1,%3) \n\t"
        "vfmadb %%v29,%%v21,%%v0,%%v29 \n\t"
        "vst %%v29, 80(%%r1,%3) \n\t"
        "vl %%v30, 96(%%r1,%3) \n\t"
        "vfmadb %%v30,%%v22,%%v0,%%v30 \n\t"
        "vst %%v30, 96(%%r1,%3) \n\t"
        "vl %%v31, 112(%%r1,%3) \n\t"
        "vfmadb %%v31,%%v23,%%v0,%%v31 \n\t"
        "vst %%v31, 112(%%r1,%3) \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        /* tail: r0 = n & 12, processed 4 elements (32 bytes) at a time */
        "1: \n\t"
        "lghi %%r0,12 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"

        "vl %%v24, 0(%%r1,%3) \n\t"
        "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
        "vst %%v24, 0(%%r1,%3) \n\t"
        "vl %%v25, 16(%%r1,%3) \n\t"
        "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t"
        "vst %%v25, 16(%%r1,%3) \n\t"

        "agfi %%r1,32 \n\t"
        "brctg %%r0,2b \n\t"

        "3: \n\t"
        "nop "
        :
        :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
/*
 * Scaled accumulate into a possibly strided vector:
 *   dest[i * inc_dest] += da * src[i]   for i = 0 .. n-1.
 *
 * With inc_dest == 1 the work is handed to add_y_kernel_4; note that the
 * kernel derives its loop counts from (n & -16) and (n & 12), so on that
 * path the low two bits of n are ignored.  Any other stride uses the plain
 * scalar loop below.
 */
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    if (inc_dest == 1)
        add_y_kernel_4(n, da, src, dest);
    else
    {
        BLASLONG i;

        for (i = 0; i < n; i++)
        {
            *dest += src[i] * da;
            dest += inc_dest;
        }
    }
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
|
|
@ -212,7 +494,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
|||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
FLOAT ybuffer[4],*xbuffer;
|
||||
FLOAT ybuffer[2] __attribute__ ((aligned(16)));
|
||||
FLOAT *xbuffer;
|
||||
FLOAT *ytemp;
|
||||
|
||||
if ( m < 1 ) return(0);
|
||||
|
|
@ -234,7 +517,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
|||
|
||||
while ( NB == NBMAX )
|
||||
{
|
||||
|
||||
m1 -= NB;
|
||||
if ( m1 < 0)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -0,0 +1,182 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector (inline asm) maximum of a contiguous array.
 *
 * Asm operands: %0 = result, %1 = n, %2 = x.
 * The loop runs n >> 5 times and reads 256 bytes (32 doubles) per pass,
 * reducing them pairwise with compare-high + select into the running
 * maximum in v0.  v0 is seeded from the first two elements of x.
 * The loop counter is decremented before it is tested, so n must be at
 * least 32; the caller in this file passes n & -32 only when it is > 0.
 */
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT result;

    __asm__ volatile (
        "vl %%v0,0(%2) \n\t"
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* first 16 elements of this pass */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        "vfchdb %%v24,%%v16,%%v17 \n\t"
        "vfchdb %%v25,%%v18,%%v19 \n\t"
        "vfchdb %%v26,%%v20,%%v21 \n\t"
        "vfchdb %%v27,%%v22,%%v23 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchdb %%v28,%%v24,%%v25 \n\t"
        "vfchdb %%v29,%%v26,%%v27 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchdb %%v30,%%v28,%%v29 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        "vfchdb %%v31,%%v30,%%v0 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        /* second 16 elements of this pass */
        "vl %%v16,128(%%r1,%2) \n\t"
        "vl %%v17,144(%%r1,%2) \n\t"
        "vl %%v18,160(%%r1,%2) \n\t"
        "vl %%v19,176(%%r1,%2) \n\t"
        "vl %%v20,192(%%r1,%2) \n\t"
        "vl %%v21,208(%%r1,%2) \n\t"
        "vl %%v22,224(%%r1,%2) \n\t"
        "vl %%v23,240(%%r1,%2) \n\t"

        "vfchdb %%v24,%%v16,%%v17 \n\t"
        "vfchdb %%v25,%%v18,%%v19 \n\t"
        "vfchdb %%v26,%%v20,%%v21 \n\t"
        "vfchdb %%v27,%%v22,%%v23 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchdb %%v28,%%v24,%%v25 \n\t"
        "vfchdb %%v29,%%v26,%%v27 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchdb %%v30,%%v28,%%v29 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        "vfchdb %%v31,%%v30,%%v0 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* reduce the two lanes of v0 to one value and return it */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfchdb %%v17,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "ldr %0,%%f0 "
        :"=f"(result)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return result;
}
|
||||
|
||||
/*
 * Return the largest of the n elements of x read with stride inc_x.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading (n & -32) elements go through dmax_kernel_32 and
 * the remainder is scanned element by element.  Any other stride is scanned
 * in scalar code, unrolled four elements at a time.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG pos = 0;     /* element offset into x */
    BLASLONG seen = 0;    /* number of elements already examined (strided path) */

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = dmax_kernel_32(bulk, x);
            pos = bulk;
        } else {
            best = x[0];
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (x[pos] > best)
                best = x[pos];
        }
        return (best);
    }

    /* Strided path: seed with the first element. */
    best = x[0];
    pos = inc_x;
    seen = 1;

    {
        BLASLONG unrolled_end = (n - 1) & -4;

        for (; seen < unrolled_end; seen += 4, pos += inc_x * 4) {
            if (x[pos] > best)
                best = x[pos];
            if (x[pos + inc_x] > best)
                best = x[pos + inc_x];
            if (x[pos + 2 * inc_x] > best)
                best = x[pos + 2 * inc_x];
            if (x[pos + 3 * inc_x] > best)
                best = x[pos + 3 * inc_x];
        }
    }

    for (; seen < n; seen++, pos += inc_x) {
        if (x[pos] > best)
            best = x[pos];
    }
    return (best);
}
|
||||
|
|
@ -0,0 +1,182 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector (inline asm) minimum of a contiguous array.
 *
 * Asm operands: %0 = result, %1 = n, %2 = x.
 * The loop runs n >> 5 times and reads 256 bytes (32 doubles) per pass,
 * reducing them pairwise with compare-high (operands swapped relative to a
 * maximum search) + select into the running minimum in v0.  v0 is seeded
 * from the first two elements of x.
 * The loop counter is decremented before it is tested, so n must be at
 * least 32; the caller in this file passes n & -32 only when it is > 0.
 */
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT result;

    __asm__ volatile (
        "vl %%v0,0(%2) \n\t"
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* first 16 elements of this pass */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        "vfchdb %%v24,%%v17,%%v16 \n\t"
        "vfchdb %%v25,%%v19,%%v18 \n\t"
        "vfchdb %%v26,%%v21,%%v20 \n\t"
        "vfchdb %%v27,%%v23,%%v22 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchdb %%v28,%%v25,%%v24 \n\t"
        "vfchdb %%v29,%%v27,%%v26 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchdb %%v30,%%v29,%%v28 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        "vfchdb %%v31,%%v0,%%v30 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        /* second 16 elements of this pass */
        "vl %%v16,128(%%r1,%2) \n\t"
        "vl %%v17,144(%%r1,%2) \n\t"
        "vl %%v18,160(%%r1,%2) \n\t"
        "vl %%v19,176(%%r1,%2) \n\t"
        "vl %%v20,192(%%r1,%2) \n\t"
        "vl %%v21,208(%%r1,%2) \n\t"
        "vl %%v22,224(%%r1,%2) \n\t"
        "vl %%v23,240(%%r1,%2) \n\t"

        "vfchdb %%v24,%%v17,%%v16 \n\t"
        "vfchdb %%v25,%%v19,%%v18 \n\t"
        "vfchdb %%v26,%%v21,%%v20 \n\t"
        "vfchdb %%v27,%%v23,%%v22 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchdb %%v28,%%v25,%%v24 \n\t"
        "vfchdb %%v29,%%v27,%%v26 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchdb %%v30,%%v29,%%v28 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        "vfchdb %%v31,%%v0,%%v30 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* reduce the two lanes of v0 to one value and return it */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfchdb %%v17,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "ldr %0,%%f0 "
        :"=f"(result)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return result;
}
|
||||
|
||||
/*
 * Return the smallest of the n elements of x read with stride inc_x.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading (n & -32) elements go through dmin_kernel_32 and
 * the remainder is scanned element by element.  Any other stride is scanned
 * in scalar code, unrolled four elements at a time.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG pos = 0;     /* element offset into x */
    BLASLONG seen = 0;    /* number of elements already examined (strided path) */

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = dmin_kernel_32(bulk, x);
            pos = bulk;
        } else {
            best = x[0];
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (x[pos] < best)
                best = x[pos];
        }
        return (best);
    }

    /* Strided path: seed with the first element. */
    best = x[0];
    pos = inc_x;
    seen = 1;

    {
        BLASLONG unrolled_end = (n - 1) & -4;

        for (; seen < unrolled_end; seen += 4, pos += inc_x * 4) {
            if (x[pos] < best)
                best = x[pos];
            if (x[pos + inc_x] < best)
                best = x[pos + inc_x];
            if (x[pos + 2 * inc_x] < best)
                best = x[pos + 2 * inc_x];
            if (x[pos + 3 * inc_x] < best)
                best = x[pos + 3 * inc_x];
        }
    }

    for (; seen < n; seen++, pos += inc_x) {
        if (x[pos] < best)
            best = x[pos];
    }
    return (best);
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
|
||||
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||
{
|
||||
__asm__ (
|
||||
"pfd 2, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"lgdr %%r1,%[cos] \n\t"
|
||||
"vlvgp %%v0,%%r1,%%r1 \n\t"
|
||||
"lgdr %%r1,%[sin] \n\t"
|
||||
"vlvgp %%v1,%%r1,%%r1 \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_x] "+m" (*(double (*)[n])x),
|
||||
[mem_y] "+m" (*(double (*)[n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
|
||||
: "cc", "r1" ,"v0","v1","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
__asm__ (
|
||||
"vlrepg %%v0,%3 \n\t"
|
||||
"vlrepg %%v1,%4 \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 0(%%r1,%1) \n\t"
|
||||
"vst %%v29, 16(%%r1,%1) \n\t"
|
||||
"vst %%v30, 32(%%r1,%1) \n\t"
|
||||
"vst %%v31, 48(%%r1,%1) \n\t"
|
||||
"vst %%v20, 0(%%r1,%2) \n\t"
|
||||
"vst %%v21, 16(%%r1,%2) \n\t"
|
||||
"vst %%v22, 32(%%r1,%2) \n\t"
|
||||
"vst %%v23, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 64(%%r1,%1) \n\t"
|
||||
"vst %%v29, 80(%%r1,%1) \n\t"
|
||||
"vst %%v30, 96(%%r1,%1) \n\t"
|
||||
"vst %%v31, 112(%%r1,%1) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 128(%%r1,%1) \n\t"
|
||||
"vst %%v29, 144(%%r1,%1) \n\t"
|
||||
"vst %%v30, 160(%%r1,%1) \n\t"
|
||||
"vst %%v31, 176(%%r1,%1) \n\t"
|
||||
"vst %%v20, 128(%%r1,%2) \n\t"
|
||||
"vst %%v21, 144(%%r1,%2) \n\t"
|
||||
"vst %%v22, 160(%%r1,%2) \n\t"
|
||||
"vst %%v23, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%1) \n\t"
|
||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
||||
"vl %%v16, 192(%%r1,%2) \n\t"
|
||||
"vl %%v17, 208(%%r1,%2) \n\t"
|
||||
"vl %%v18, 224(%%r1,%2) \n\t"
|
||||
"vl %%v19, 240(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 192(%%r1,%1) \n\t"
|
||||
"vst %%v29, 208(%%r1,%1) \n\t"
|
||||
"vst %%v30, 224(%%r1,%1) \n\t"
|
||||
"vst %%v31, 240(%%r1,%1) \n\t"
|
||||
"vst %%v20, 192(%%r1,%2) \n\t"
|
||||
"vst %%v21, 208(%%r1,%2) \n\t"
|
||||
"vst %%v22, 224(%%r1,%2) \n\t"
|
||||
"vst %%v23, 240(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
|
|
@ -214,8 +204,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
|
||||
drot_kernel_32(n1, x, y, c, s);
|
||||
FLOAT cosa,sina;
|
||||
cosa=c;
|
||||
sina=s;
|
||||
drot_kernel_32(n1, x, y, &cosa, &sina);
|
||||
i=n1;
|
||||
}
|
||||
|
||||
|
|
@ -229,6 +221,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
@ -250,3 +243,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -27,135 +27,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef Z13_A
|
||||
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
|
||||
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile (
|
||||
"vlrepg %%v0,%1 \n\t"
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
"vl %%v24, 0(%%r1,%2) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v0 \n\t"
|
||||
"vst %%v24, 0(%%r1,%2) \n\t"
|
||||
"vl %%v25, 16(%%r1,%2) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v0 \n\t"
|
||||
"vst %%v25, 16(%%r1,%2) \n\t"
|
||||
"vl %%v26, 32(%%r1,%2) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v0 \n\t"
|
||||
"vst %%v26, 32(%%r1,%2) \n\t"
|
||||
"vl %%v27, 48(%%r1,%2) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v0 \n\t"
|
||||
"vst %%v27, 48(%%r1,%2) \n\t"
|
||||
"vl %%v24, 64(%%r1,%2) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v0 \n\t"
|
||||
"vst %%v24, 64(%%r1,%2) \n\t"
|
||||
"vl %%v25, 80(%%r1,%2) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v0 \n\t"
|
||||
"vst %%v25, 80(%%r1,%2) \n\t"
|
||||
"vl %%v26, 96(%%r1,%2) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v0 \n\t"
|
||||
"vst %%v26, 96(%%r1,%2) \n\t"
|
||||
"vl %%v27, 112(%%r1,%2) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v0 \n\t"
|
||||
"vst %%v27, 112(%%r1,%2) \n\t"
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v0,%%r0,%%r0 \n\t"
|
||||
"srlg %[n],%[n],4 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr], 128(%[x_ptr]) \n\t"
|
||||
"aghik %[n], %[n], -1 \n\t"
|
||||
"jle 2f \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vfmdb %%v24, %%v16, %%v0 \n\t"
|
||||
"vfmdb %%v25, %%v17, %%v0 \n\t"
|
||||
"vfmdb %%v26, %%v18, %%v0 \n\t"
|
||||
"vfmdb %%v27, %%v19, %%v1 \n\t"
|
||||
"vlm %%v16,%%v19, 0(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v28, %%v20, %%v0 \n\t"
|
||||
"vfmdb %%v29, %%v21, %%v1 \n\t"
|
||||
"vfmdb %%v30, %%v22, %%v0 \n\t"
|
||||
"vfmdb %%v31, %%v23, %%v1 \n\t"
|
||||
"vlm %%v20,%%v23, 64(%[x_ptr]) \n\t"
|
||||
"lay %[x_ptr], -128(%[x_ptr]) \n\t"
|
||||
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr],256(%[x_ptr]) \n\t"
|
||||
"brctg %[n],1b \n\t"
|
||||
"2: \n\t"
|
||||
"vfmdb %%v24, %%v16, %%v0 \n\t"
|
||||
"vfmdb %%v25, %%v17, %%v1 \n\t"
|
||||
"vfmdb %%v26, %%v18, %%v0 \n\t"
|
||||
"vfmdb %%v27, %%v19, %%v1 \n\t"
|
||||
"lay %[x_ptr] , -128(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v28, %%v20, %%v0 \n\t"
|
||||
"vfmdb %%v29, %%v21, %%v1 \n\t"
|
||||
"vfmdb %%v30, %%v22, %%v0 \n\t"
|
||||
"vfmdb %%v31, %%v23, %%v1 \n\t"
|
||||
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
|
||||
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n)
|
||||
: [alpha] "f"(da)
|
||||
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
|
||||
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
#else
|
||||
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
|
||||
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
|
||||
/* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */
|
||||
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v0,%%r0,%%r0 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%[x_ptr]) \n\t"
|
||||
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v16,%%v16,%%v0 \n\t"
|
||||
"vfmdb %%v17,%%v17,%%v1 \n\t"
|
||||
"vfmdb %%v18,%%v18,%%v0 \n\t"
|
||||
"vfmdb %%v19,%%v19,%%v1 \n\t"
|
||||
"vfmdb %%v20,%%v20,%%v0 \n\t"
|
||||
"vfmdb %%v21,%%v21,%%v1 \n\t"
|
||||
"vfmdb %%v22,%%v22,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v23,%%v1 \n\t"
|
||||
"vstm %%v16,%%v23, 0(%[x_ptr]) \n\t"
|
||||
"vlm %%v24,%%v31,128(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v1 \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v1 \n\t"
|
||||
"vfmdb %%v28,%%v28,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v29,%%v1 \n\t"
|
||||
"vfmdb %%v30,%%v30,%%v0 \n\t"
|
||||
"vfmdb %%v31,%%v31,%%v1 \n\t"
|
||||
"vstm %%v24,%%v31,128(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr], 256(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n),[alpha] "f"(da)
|
||||
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
|
||||
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
}
|
||||
#endif
|
||||
static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x )
|
||||
{
|
||||
|
||||
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 0(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 32(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 48(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 64(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 80(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 96(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 112(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 128(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 144(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 160(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 176(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 192(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 208(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 224(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 240(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr],256(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n)
|
||||
:"cc" , "r0", "v24" ,"v25"
|
||||
);
|
||||
"vst %%v24,0(%%r1,%1) \n\t"
|
||||
"vst %%v25,16(%%r1,%1) \n\t"
|
||||
"vst %%v26,32(%%r1,%1) \n\t"
|
||||
"vst %%v27,48(%%r1,%1) \n\t"
|
||||
"vst %%v24,64(%%r1,%1) \n\t"
|
||||
"vst %%v25,80(%%r1,%1) \n\t"
|
||||
"vst %%v26,96(%%r1,%1) \n\t"
|
||||
"vst %%v27,112(%%r1,%1) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v24","v25","v26","v27"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0,j=0;
|
||||
|
|
@ -169,11 +109,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
if ( da == 0.0 )
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
|
||||
dscal_kernel_32_zero(n1 , x);
|
||||
dscal_kernel_16_zero(n1, x);
|
||||
j=n1;
|
||||
}
|
||||
|
||||
|
|
@ -188,10 +128,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
else
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
dscal_kernel_32(n1 , da , x);
|
||||
dscal_kernel_16(n1, da, x);
|
||||
j=n1;
|
||||
}
|
||||
while(j < n)
|
||||
|
|
@ -260,4 +200,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
}
|
||||
return 0;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,180 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018,The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms,with or without
|
||||
modification,are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice,this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice,this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Dot product of two contiguous vectors of 4-byte floating-point elements,
 * accumulated in double precision.
 *
 *   n  number of elements; the loop runs n / 32 times (n >> 5), so the
 *      caller is expected to pass a non-zero multiple of 32
 *   x  first operand, unit stride
 *   y  second operand, unit stride
 *
 * Returns the double-precision sum of x[i] * y[i].
 *
 * Every element is widened to double BEFORE it is multiplied, so each
 * product is exact in double precision and rounding only happens in the
 * accumulation.  (Multiplying in single precision first and widening the
 * product afterwards rounds every term to float, which defeats the purpose
 * of DSDOT.)
 *
 * Per 16-byte vector of four floats:
 *   vflls          widens elements 0 and 2 to two doubles
 *   veslg ..,32    moves elements 1 and 3 into the positions of 0 and 2
 *   vflls          widens those as well
 *   vfmadb         acc = x * y + acc on the widened pairs
 *
 * v0..v3 are four independent accumulators that are folded together after
 * the loop; r0 is the loop counter and r1 the running byte offset.
 */
static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
    double dot;

    __asm__ volatile (
        "vzero %%v0                      \n\t"
        "vzero %%v1                      \n\t"
        "vzero %%v2                      \n\t"
        "vzero %%v3                      \n\t"
        "srlg  %%r0,%1,5                 \n\t"
        "xgr   %%r1,%%r1                 \n\t"
        "0:                              \n\t"
        /* both operands are only read, so prefetch them for fetch access */
        "pfd 1,1024(%%r1,%2)             \n\t"
        "pfd 1,1024(%%r1,%3)             \n\t"

        /* elements 0..3 */
        "vl    %%v16,0(%%r1,%2)          \n\t"
        "vl    %%v24,0(%%r1,%3)          \n\t"
        "vflls %%v17,%%v16               \n\t"
        "vflls %%v25,%%v24               \n\t"
        "veslg %%v16,%%v16,32            \n\t"
        "veslg %%v24,%%v24,32            \n\t"
        "vflls %%v16,%%v16               \n\t"
        "vflls %%v24,%%v24               \n\t"
        "vfmadb %%v0,%%v17,%%v25,%%v0    \n\t"
        "vfmadb %%v1,%%v16,%%v24,%%v1    \n\t"

        /* elements 4..7 */
        "vl    %%v18,16(%%r1,%2)         \n\t"
        "vl    %%v26,16(%%r1,%3)         \n\t"
        "vflls %%v19,%%v18               \n\t"
        "vflls %%v27,%%v26               \n\t"
        "veslg %%v18,%%v18,32            \n\t"
        "veslg %%v26,%%v26,32            \n\t"
        "vflls %%v18,%%v18               \n\t"
        "vflls %%v26,%%v26               \n\t"
        "vfmadb %%v2,%%v19,%%v27,%%v2    \n\t"
        "vfmadb %%v3,%%v18,%%v26,%%v3    \n\t"

        /* elements 8..11 */
        "vl    %%v20,32(%%r1,%2)         \n\t"
        "vl    %%v28,32(%%r1,%3)         \n\t"
        "vflls %%v21,%%v20               \n\t"
        "vflls %%v29,%%v28               \n\t"
        "veslg %%v20,%%v20,32            \n\t"
        "veslg %%v28,%%v28,32            \n\t"
        "vflls %%v20,%%v20               \n\t"
        "vflls %%v28,%%v28               \n\t"
        "vfmadb %%v0,%%v21,%%v29,%%v0    \n\t"
        "vfmadb %%v1,%%v20,%%v28,%%v1    \n\t"

        /* elements 12..15 */
        "vl    %%v22,48(%%r1,%2)         \n\t"
        "vl    %%v30,48(%%r1,%3)         \n\t"
        "vflls %%v23,%%v22               \n\t"
        "vflls %%v31,%%v30               \n\t"
        "veslg %%v22,%%v22,32            \n\t"
        "veslg %%v30,%%v30,32            \n\t"
        "vflls %%v22,%%v22               \n\t"
        "vflls %%v30,%%v30               \n\t"
        "vfmadb %%v2,%%v23,%%v31,%%v2    \n\t"
        "vfmadb %%v3,%%v22,%%v30,%%v3    \n\t"

        /* elements 16..19 */
        "vl    %%v16,64(%%r1,%2)         \n\t"
        "vl    %%v24,64(%%r1,%3)         \n\t"
        "vflls %%v17,%%v16               \n\t"
        "vflls %%v25,%%v24               \n\t"
        "veslg %%v16,%%v16,32            \n\t"
        "veslg %%v24,%%v24,32            \n\t"
        "vflls %%v16,%%v16               \n\t"
        "vflls %%v24,%%v24               \n\t"
        "vfmadb %%v0,%%v17,%%v25,%%v0    \n\t"
        "vfmadb %%v1,%%v16,%%v24,%%v1    \n\t"

        /* elements 20..23 */
        "vl    %%v18,80(%%r1,%2)         \n\t"
        "vl    %%v26,80(%%r1,%3)         \n\t"
        "vflls %%v19,%%v18               \n\t"
        "vflls %%v27,%%v26               \n\t"
        "veslg %%v18,%%v18,32            \n\t"
        "veslg %%v26,%%v26,32            \n\t"
        "vflls %%v18,%%v18               \n\t"
        "vflls %%v26,%%v26               \n\t"
        "vfmadb %%v2,%%v19,%%v27,%%v2    \n\t"
        "vfmadb %%v3,%%v18,%%v26,%%v3    \n\t"

        /* elements 24..27 */
        "vl    %%v20,96(%%r1,%2)         \n\t"
        "vl    %%v28,96(%%r1,%3)         \n\t"
        "vflls %%v21,%%v20               \n\t"
        "vflls %%v29,%%v28               \n\t"
        "veslg %%v20,%%v20,32            \n\t"
        "veslg %%v28,%%v28,32            \n\t"
        "vflls %%v20,%%v20               \n\t"
        "vflls %%v28,%%v28               \n\t"
        "vfmadb %%v0,%%v21,%%v29,%%v0    \n\t"
        "vfmadb %%v1,%%v20,%%v28,%%v1    \n\t"

        /* elements 28..31 */
        "vl    %%v22,112(%%r1,%2)        \n\t"
        "vl    %%v30,112(%%r1,%3)        \n\t"
        "vflls %%v23,%%v22               \n\t"
        "vflls %%v31,%%v30               \n\t"
        "veslg %%v22,%%v22,32            \n\t"
        "veslg %%v30,%%v30,32            \n\t"
        "vflls %%v22,%%v22               \n\t"
        "vflls %%v30,%%v30               \n\t"
        "vfmadb %%v2,%%v23,%%v31,%%v2    \n\t"
        "vfmadb %%v3,%%v22,%%v30,%%v3    \n\t"

        /* 32 elements of 4 bytes consumed per pass */
        "agfi  %%r1,128                  \n\t"
        "brctg %%r0,0b                   \n\t"

        /* fold the four accumulators, then add the two lanes of v0 */
        "vfadb %%v0,%%v0,%%v1            \n\t"
        "vfadb %%v2,%%v2,%%v3            \n\t"
        "vfadb %%v0,%%v0,%%v2            \n\t"
        "vrepg %%v1,%%v0,1               \n\t"
        "adbr  %%f0,%%f1                 \n\t"
        "ldr   %0,%%f0                       "
        :"=f"(dot)
        :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return dot;
}
|
||||
|
||||
/*
 * DSDOT driver: dot product of two FLOAT vectors, returned as a double.
 *
 *   n      number of elements
 *   x, y   the two operands
 *   inc_x  stride of x, in elements
 *   inc_y  stride of y, in elements
 *
 * Returns 0.0 when n <= 0.
 *
 * When both strides are 1, whole groups of 32 elements are handed to
 * dsdot_kernel_32 and the remainder is summed here.  Any other stride
 * combination is summed here entirely, two elements per pass.
 *
 * Both factors are converted to double before they are multiplied.  Without
 * the casts, C multiplies the two FLOAT operands in FLOAT precision and only
 * widens the already-rounded product (assuming FLOAT is single precision in
 * this build, as the DSDOT name implies).
 */
double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
    BLASLONG i=0;
    BLASLONG ix=0,iy=0;

    double dot = 0.0 ;

    if ( n <= 0 ) return(dot);

    if ( (inc_x == 1) && (inc_y == 1) )
    {

        /* largest multiple of 32 not exceeding n */
        BLASLONG n1 = n & -32;

        if ( n1 )
            dot = dsdot_kernel_32(n1,x,y);

        /* leftover elements that do not fill a 32-element group */
        i = n1;
        while(i < n)
        {

            dot += (double) y[i] * (double) x[i] ;
            i++ ;

        }
        return(dot);

    }

    /* strided case: two elements per pass, then at most one straggler */
    BLASLONG n1 = n & -2;

    while(i < n1)
    {

        dot += (double) y[iy] * (double) x[ix]
             + (double) y[iy+inc_y] * (double) x[ix+inc_x];
        ix += inc_x*2 ;
        iy += inc_y*2 ;
        i+=2 ;

    }

    while(i < n)
    {

        dot += (double) y[iy] * (double) x[ix] ;
        ix += inc_x ;
        iy += inc_y ;
        i++ ;

    }
    return(dot);

}
|
||||
|
||||
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -25,217 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
|
||||
#if defined(Z13_SWAP_A)
|
||||
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"
|
||||
__asm__ volatile(
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_x] "+m" (*(double (*)[n])x),
|
||||
[mem_y] "+m" (*(double (*)[n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
,"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
"vl %%v0, 128(%%r1,%2) \n\t"
|
||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
||||
|
||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Exchange the contents of x and y using the z/Architecture vector facility.
 *
 * Each loop pass moves 256 bytes (32 doubles) in each direction:
 *   1. x[0..31]  is loaded into v16..v31,
 *   2. y[0..31]  is copied into x through v0..v7 (two groups of 128 bytes),
 *   3. the saved x data in v16..v31 is written to y.
 *
 * The pass count is n >> 5 and the loop is a decrement-and-branch, so this
 * is only meaningful when n is a positive multiple of 32.
 * NOTE(review): the caller is not visible here - confirm it guarantees that.
 *
 * n is modified in place by the asm (it doubles as the loop counter);
 * r1 is used as the running byte offset into both vectors.
 */
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
    __asm__ volatile(
        /* Prefetch hints (code 2) for the first lines of both vectors. */
        "pfd   2, 0(%[xp])              \n\t"
        "pfd   2, 0(%[yp])              \n\t"
        /* blocks = n / 32 ; r1 = 0 (byte offset) */
        "srlg  %[blocks],%[blocks],5    \n\t"
        "xgr   %%r1,%%r1                \n\t"
        ".align 16                      \n\t"
        "0:                             \n\t"
        "pfd   2, 256(%%r1,%[xp])       \n\t"
        "pfd   2, 256(%%r1,%[yp])       \n\t"

        /* Save 256 bytes of x in v16..v31. */
        "vl    %%v16,   0(%%r1,%[xp])   \n\t"
        "vl    %%v17,  16(%%r1,%[xp])   \n\t"
        "vl    %%v18,  32(%%r1,%[xp])   \n\t"
        "vl    %%v19,  48(%%r1,%[xp])   \n\t"
        "vl    %%v20,  64(%%r1,%[xp])   \n\t"
        "vl    %%v21,  80(%%r1,%[xp])   \n\t"
        "vl    %%v22,  96(%%r1,%[xp])   \n\t"
        "vl    %%v23, 112(%%r1,%[xp])   \n\t"
        "vl    %%v24, 128(%%r1,%[xp])   \n\t"
        "vl    %%v25, 144(%%r1,%[xp])   \n\t"
        "vl    %%v26, 160(%%r1,%[xp])   \n\t"
        "vl    %%v27, 176(%%r1,%[xp])   \n\t"
        "vl    %%v28, 192(%%r1,%[xp])   \n\t"
        "vl    %%v29, 208(%%r1,%[xp])   \n\t"
        "vl    %%v30, 224(%%r1,%[xp])   \n\t"
        "vl    %%v31, 240(%%r1,%[xp])   \n\t"

        /* y[0..15] -> x[0..15] via v0..v7. */
        "vl    %%v0,    0(%%r1,%[yp])   \n\t"
        "vl    %%v1,   16(%%r1,%[yp])   \n\t"
        "vl    %%v2,   32(%%r1,%[yp])   \n\t"
        "vl    %%v3,   48(%%r1,%[yp])   \n\t"
        "vl    %%v4,   64(%%r1,%[yp])   \n\t"
        "vl    %%v5,   80(%%r1,%[yp])   \n\t"
        "vl    %%v6,   96(%%r1,%[yp])   \n\t"
        "vl    %%v7,  112(%%r1,%[yp])   \n\t"
        "vst   %%v0,    0(%%r1,%[xp])   \n\t"
        "vst   %%v1,   16(%%r1,%[xp])   \n\t"
        "vst   %%v2,   32(%%r1,%[xp])   \n\t"
        "vst   %%v3,   48(%%r1,%[xp])   \n\t"
        "vst   %%v4,   64(%%r1,%[xp])   \n\t"
        "vst   %%v5,   80(%%r1,%[xp])   \n\t"
        "vst   %%v6,   96(%%r1,%[xp])   \n\t"
        "vst   %%v7,  112(%%r1,%[xp])   \n\t"

        /* y[16..31] -> x[16..31] via v0..v7. */
        "vl    %%v0,  128(%%r1,%[yp])   \n\t"
        "vl    %%v1,  144(%%r1,%[yp])   \n\t"
        "vl    %%v2,  160(%%r1,%[yp])   \n\t"
        "vl    %%v3,  176(%%r1,%[yp])   \n\t"
        "vl    %%v4,  192(%%r1,%[yp])   \n\t"
        "vl    %%v5,  208(%%r1,%[yp])   \n\t"
        "vl    %%v6,  224(%%r1,%[yp])   \n\t"
        "vl    %%v7,  240(%%r1,%[yp])   \n\t"
        "vst   %%v0,  128(%%r1,%[xp])   \n\t"
        "vst   %%v1,  144(%%r1,%[xp])   \n\t"
        "vst   %%v2,  160(%%r1,%[xp])   \n\t"
        "vst   %%v3,  176(%%r1,%[xp])   \n\t"
        "vst   %%v4,  192(%%r1,%[xp])   \n\t"
        "vst   %%v5,  208(%%r1,%[xp])   \n\t"
        "vst   %%v6,  224(%%r1,%[xp])   \n\t"
        "vst   %%v7,  240(%%r1,%[xp])   \n\t"

        /* Saved x data -> y[0..31]. */
        "vst   %%v16,   0(%%r1,%[yp])   \n\t"
        "vst   %%v17,  16(%%r1,%[yp])   \n\t"
        "vst   %%v18,  32(%%r1,%[yp])   \n\t"
        "vst   %%v19,  48(%%r1,%[yp])   \n\t"
        "vst   %%v20,  64(%%r1,%[yp])   \n\t"
        "vst   %%v21,  80(%%r1,%[yp])   \n\t"
        "vst   %%v22,  96(%%r1,%[yp])   \n\t"
        "vst   %%v23, 112(%%r1,%[yp])   \n\t"
        "vst   %%v24, 128(%%r1,%[yp])   \n\t"
        "vst   %%v25, 144(%%r1,%[yp])   \n\t"
        "vst   %%v26, 160(%%r1,%[yp])   \n\t"
        "vst   %%v27, 176(%%r1,%[yp])   \n\t"
        "vst   %%v28, 192(%%r1,%[yp])   \n\t"
        "vst   %%v29, 208(%%r1,%[yp])   \n\t"
        "vst   %%v30, 224(%%r1,%[yp])   \n\t"
        "vst   %%v31, 240(%%r1,%[yp])   \n\t"

        /* Advance the byte offset and loop while blocks remain. */
        "la    %%r1,256(%%r1)           \n\t"
        "brctg %[blocks],0b"
        : [xmem]   "+m" (*(double (*)[n])x),
          [ymem]   "+m" (*(double (*)[n])y),
          [blocks] "+&r"(n)
        : [xp] "a"(x), [yp] "a"(y)
        : "cc", "r1",
          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );
}
|
||||
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
|
@ -284,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
|||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,319 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * Vector kernel for ICAMAX on interleaved single-precision complex data.
 *
 * Scans n complex elements of x (re,im pairs, unit stride) and finds the
 * element with the largest |re| + |im|.
 *
 * n     number of complex elements; the loop runs n >> 5 times with a
 *       decrement-and-branch, so n must be a positive multiple of 32
 *       (the driver in this file only calls it with n & -32 when that is > 0).
 * x     input vector, 2 * n floats.
 * amax  (out) the winning |re| + |im| value.
 * returns the 0-based index of the winning element.
 *
 * Register roles inside the asm:
 *   v0        running per-lane maxima (4 float lanes)
 *   v1 / v2   64-bit indices belonging to float lanes {0,2} / {1,3} of v0
 *   v3        constant 16 (elements consumed per half pass)
 *   v4        base index of the 16-element group being processed
 *   v24..v27  32-bit lane indices 0..15 inside a 16-element group
 *   r0 / r1   loop counter / running byte offset
 *
 * Fixes relative to the previous revision:
 *   - the imaginary part of element 7 was loaded from byte offset 30
 *     instead of 60;
 *   - the prefetch address ignored the running offset in r1;
 *   - comparisons inside a 16-element group used strict "greater than",
 *     so equal magnitudes selected the later element; they now use
 *     "greater or equal" so the lower index wins, matching the strict
 *     comparison used by the scalar code in the driver;
 *   - the scalar result is now stored before the "=r" output register is
 *     written, because a non-early-clobber output may share a register with
 *     the address of the "=m" operand.
 */
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
{
    BLASLONG iamax;

    __asm__ volatile (
        /* Seed v0 with |re|+|im| of elements 0..3 (de-interleaving loads). */
        "vlef   %%v0,0(%3),0            \n\t"
        "vlef   %%v1,4(%3),0            \n\t"
        "vlef   %%v0,8(%3),1            \n\t"
        "vlef   %%v1,12(%3),1           \n\t"
        "vlef   %%v0,16(%3),2           \n\t"
        "vlef   %%v1,20(%3),2           \n\t"
        "vlef   %%v0,24(%3),3           \n\t"
        "vlef   %%v1,28(%3),3           \n\t"
        "vflpsb %%v0,%%v0               \n\t"
        "vflpsb %%v1,%%v1               \n\t"
        "vfasb  %%v0,%%v0,%%v1          \n\t"
        /* Indices of the seed: v1 = {0,2}, v2 = {1,3}. */
        "vleig  %%v1,0,0                \n\t"
        "vleig  %%v1,2,1                \n\t"
        "vleig  %%v2,1,0                \n\t"
        "vleig  %%v2,3,1                \n\t"
        "vrepig %%v3,16                 \n\t"
        "vzero  %%v4                    \n\t"
        /* Lane index tables 0..15. */
        "vleif  %%v24,0,0               \n\t"
        "vleif  %%v24,1,1               \n\t"
        "vleif  %%v24,2,2               \n\t"
        "vleif  %%v24,3,3               \n\t"
        "vleif  %%v25,4,0               \n\t"
        "vleif  %%v25,5,1               \n\t"
        "vleif  %%v25,6,2               \n\t"
        "vleif  %%v25,7,3               \n\t"
        "vleif  %%v26,8,0               \n\t"
        "vleif  %%v26,9,1               \n\t"
        "vleif  %%v26,10,2              \n\t"
        "vleif  %%v26,11,3              \n\t"
        "vleif  %%v27,12,0              \n\t"
        "vleif  %%v27,13,1              \n\t"
        "vleif  %%v27,14,2              \n\t"
        "vleif  %%v27,15,3              \n\t"
        "srlg   %%r0,%2,5               \n\t"
        "xgr    %%r1,%%r1               \n\t"
        "0:                             \n\t"
        /* Prefetch ahead of the current position (was missing the index). */
        "pfd    1, 1024(%%r1,%3)        \n\t"

        /* ---- first 16 elements of the pass ---- */
        "vlef   %%v16,0(%%r1,%3),0      \n\t"
        "vlef   %%v17,4(%%r1,%3),0      \n\t"
        "vlef   %%v16,8(%%r1,%3),1      \n\t"
        "vlef   %%v17,12(%%r1,%3),1     \n\t"
        "vlef   %%v16,16(%%r1,%3),2     \n\t"
        "vlef   %%v17,20(%%r1,%3),2     \n\t"
        "vlef   %%v16,24(%%r1,%3),3     \n\t"
        "vlef   %%v17,28(%%r1,%3),3     \n\t"

        "vlef   %%v18,32(%%r1,%3),0     \n\t"
        "vlef   %%v19,36(%%r1,%3),0     \n\t"
        "vlef   %%v18,40(%%r1,%3),1     \n\t"
        "vlef   %%v19,44(%%r1,%3),1     \n\t"
        "vlef   %%v18,48(%%r1,%3),2     \n\t"
        "vlef   %%v19,52(%%r1,%3),2     \n\t"
        "vlef   %%v18,56(%%r1,%3),3     \n\t"
        /* imaginary part of element 7: byte offset 60 (was 30) */
        "vlef   %%v19,60(%%r1,%3),3     \n\t"

        "vlef   %%v20,64(%%r1,%3),0     \n\t"
        "vlef   %%v21,68(%%r1,%3),0     \n\t"
        "vlef   %%v20,72(%%r1,%3),1     \n\t"
        "vlef   %%v21,76(%%r1,%3),1     \n\t"
        "vlef   %%v20,80(%%r1,%3),2     \n\t"
        "vlef   %%v21,84(%%r1,%3),2     \n\t"
        "vlef   %%v20,88(%%r1,%3),3     \n\t"
        "vlef   %%v21,92(%%r1,%3),3     \n\t"

        "vlef   %%v22,96(%%r1,%3),0     \n\t"
        "vlef   %%v23,100(%%r1,%3),0    \n\t"
        "vlef   %%v22,104(%%r1,%3),1    \n\t"
        "vlef   %%v23,108(%%r1,%3),1    \n\t"
        "vlef   %%v22,112(%%r1,%3),2    \n\t"
        "vlef   %%v23,116(%%r1,%3),2    \n\t"
        "vlef   %%v22,120(%%r1,%3),3    \n\t"
        "vlef   %%v23,124(%%r1,%3),3    \n\t"

        /* |re| + |im| for the four groups of four elements. */
        "vflpsb %%v16, %%v16            \n\t"
        "vflpsb %%v17, %%v17            \n\t"
        "vflpsb %%v18, %%v18            \n\t"
        "vflpsb %%v19, %%v19            \n\t"
        "vflpsb %%v20, %%v20            \n\t"
        "vflpsb %%v21, %%v21            \n\t"
        "vflpsb %%v22, %%v22            \n\t"
        "vflpsb %%v23, %%v23            \n\t"
        "vfasb  %%v16,%%v16,%%v17       \n\t"
        "vfasb  %%v17,%%v18,%%v19       \n\t"
        "vfasb  %%v18,%%v20,%%v21       \n\t"
        "vfasb  %%v19,%%v22,%%v23       \n\t"

        /* Pairwise maxima; ">=" keeps the lower-index group on ties. */
        "vfchesb %%v5,%%v16,%%v17       \n\t"
        "vfchesb %%v6,%%v18,%%v19       \n\t"
        "vsel   %%v16,%%v16,%%v17,%%v5  \n\t"
        "vsel   %%v5,%%v24,%%v25,%%v5   \n\t"
        "vsel   %%v17,%%v18,%%v19,%%v6  \n\t"
        "vsel   %%v6,%%v26,%%v27,%%v6   \n\t"

        "vfchesb %%v18,%%v16,%%v17      \n\t"
        "vsel   %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel   %%v5,%%v5,%%v6,%%v18    \n\t"
        /* Widen the 32-bit lane indices to 64 bit and add the group base. */
        "vsegf  %%v6,%%v5               \n\t"
        "vesrlg %%v5,%%v5,32            \n\t"
        "vag    %%v5,%%v5,%%v4          \n\t"
        "vag    %%v6,%%v6,%%v4          \n\t"

        /* Merge into the running maxima (strict ">" keeps earlier data). */
        "vfchsb %%v7,%%v16,%%v0         \n\t"
        "vsel   %%v0,%%v16,%%v0,%%v7    \n\t"
        "vsegf  %%v8,%%v7               \n\t"
        "vesrlg %%v7,%%v7,32            \n\t"
        "vsegf  %%v7,%%v7               \n\t"
        "vsel   %%v1,%%v5,%%v1,%%v7     \n\t"
        "vsel   %%v2,%%v6,%%v2,%%v8     \n\t"
        "vag    %%v4,%%v4,%%v3          \n\t"

        /* ---- second 16 elements of the pass ---- */
        "vlef   %%v16,128(%%r1,%3),0    \n\t"
        "vlef   %%v17,132(%%r1,%3),0    \n\t"
        "vlef   %%v16,136(%%r1,%3),1    \n\t"
        "vlef   %%v17,140(%%r1,%3),1    \n\t"
        "vlef   %%v16,144(%%r1,%3),2    \n\t"
        "vlef   %%v17,148(%%r1,%3),2    \n\t"
        "vlef   %%v16,152(%%r1,%3),3    \n\t"
        "vlef   %%v17,156(%%r1,%3),3    \n\t"

        "vlef   %%v18,160(%%r1,%3),0    \n\t"
        "vlef   %%v19,164(%%r1,%3),0    \n\t"
        "vlef   %%v18,168(%%r1,%3),1    \n\t"
        "vlef   %%v19,172(%%r1,%3),1    \n\t"
        "vlef   %%v18,176(%%r1,%3),2    \n\t"
        "vlef   %%v19,180(%%r1,%3),2    \n\t"
        "vlef   %%v18,184(%%r1,%3),3    \n\t"
        "vlef   %%v19,188(%%r1,%3),3    \n\t"

        "vlef   %%v20,192(%%r1,%3),0    \n\t"
        "vlef   %%v21,196(%%r1,%3),0    \n\t"
        "vlef   %%v20,200(%%r1,%3),1    \n\t"
        "vlef   %%v21,204(%%r1,%3),1    \n\t"
        "vlef   %%v20,208(%%r1,%3),2    \n\t"
        "vlef   %%v21,212(%%r1,%3),2    \n\t"
        "vlef   %%v20,216(%%r1,%3),3    \n\t"
        "vlef   %%v21,220(%%r1,%3),3    \n\t"

        "vlef   %%v22,224(%%r1,%3),0    \n\t"
        "vlef   %%v23,228(%%r1,%3),0    \n\t"
        "vlef   %%v22,232(%%r1,%3),1    \n\t"
        "vlef   %%v23,236(%%r1,%3),1    \n\t"
        "vlef   %%v22,240(%%r1,%3),2    \n\t"
        "vlef   %%v23,244(%%r1,%3),2    \n\t"
        "vlef   %%v22,248(%%r1,%3),3    \n\t"
        "vlef   %%v23,252(%%r1,%3),3    \n\t"

        "vflpsb %%v16, %%v16            \n\t"
        "vflpsb %%v17, %%v17            \n\t"
        "vflpsb %%v18, %%v18            \n\t"
        "vflpsb %%v19, %%v19            \n\t"
        "vflpsb %%v20, %%v20            \n\t"
        "vflpsb %%v21, %%v21            \n\t"
        "vflpsb %%v22, %%v22            \n\t"
        "vflpsb %%v23, %%v23            \n\t"
        "vfasb  %%v16,%%v16,%%v17       \n\t"
        "vfasb  %%v17,%%v18,%%v19       \n\t"
        "vfasb  %%v18,%%v20,%%v21       \n\t"
        "vfasb  %%v19,%%v22,%%v23       \n\t"

        "vfchesb %%v5,%%v16,%%v17       \n\t"
        "vfchesb %%v6,%%v18,%%v19       \n\t"
        "vsel   %%v16,%%v16,%%v17,%%v5  \n\t"
        "vsel   %%v5,%%v24,%%v25,%%v5   \n\t"
        "vsel   %%v17,%%v18,%%v19,%%v6  \n\t"
        "vsel   %%v6,%%v26,%%v27,%%v6   \n\t"

        "vfchesb %%v18,%%v16,%%v17      \n\t"
        "vsel   %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel   %%v5,%%v5,%%v6,%%v18    \n\t"
        "vsegf  %%v6,%%v5               \n\t"
        "vesrlg %%v5,%%v5,32            \n\t"
        "vag    %%v5,%%v5,%%v4          \n\t"
        "vag    %%v6,%%v6,%%v4          \n\t"

        "vfchsb %%v7,%%v16,%%v0         \n\t"
        "vsel   %%v0,%%v16,%%v0,%%v7    \n\t"
        "vsegf  %%v8,%%v7               \n\t"
        "vesrlg %%v7,%%v7,32            \n\t"
        "vsegf  %%v7,%%v7               \n\t"
        "vsel   %%v1,%%v5,%%v1,%%v7     \n\t"
        "vsel   %%v2,%%v6,%%v2,%%v8     \n\t"
        "vag    %%v4,%%v4,%%v3          \n\t"

        /* 32 complex floats = 256 bytes per pass. */
        "agfi   %%r1, 256               \n\t"
        "brctg  %%r0, 0b                \n\t"

        /* Reduce lanes {0,1} and {2,3}; on equal values the lower index wins. */
        "veslg  %%v3,%%v0,32            \n\t"
        "vfchsb %%v4,%%v0,%%v3          \n\t"
        "vchlg  %%v5,%%v2,%%v1          \n\t"
        "vfcesb %%v6,%%v0,%%v3          \n\t"
        "vn     %%v5,%%v5,%%v6          \n\t"
        "vo     %%v4,%%v4,%%v5          \n\t"
        "vsel   %%v0,%%v0,%%v3,%%v4     \n\t"
        "vesrlg %%v4,%%v4,32            \n\t"
        "vsegf  %%v4,%%v4               \n\t"
        "vsel   %%v1,%%v1,%%v2,%%v4     \n\t"

        /* Final scalar reduction between the two remaining candidates. */
        "vrepf  %%v2,%%v0,2             \n\t"
        "vrepg  %%v3,%%v1,1             \n\t"
        "wfcsb  %%v2,%%v0               \n\t"
        "jne    1f                      \n\t"
        /* Equal values: store the value, return the smaller index. */
        "vstef  %%v0,%1,0               \n\t"
        "vmnlg  %%v0,%%v1,%%v3          \n\t"
        "vlgvg  %0,%%v0,0               \n\t"
        "j      2f                      \n\t"
        "1:                             \n\t"
        "wfchsb %%v4,%%v2,%%v0          \n\t"
        "vsel   %%v1,%%v3,%%v1,%%v4     \n\t"
        "vsel   %%v0,%%v2,%%v0,%%v4     \n\t"
        /* Store through %1 before %0 is written (see header comment). */
        "ste    %%f0,%1                 \n\t"
        "vlgvg  %0,%%v1,0               \n\t"
        "2:                             \n\t"
        "nop "
        :"=r"(iamax),"=m"(*amax)
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return iamax;
}
|
||||
|
||||
/*
 * ICAMAX driver: 1-based index of the first complex element of x whose
 * |re| + |im| is largest.
 *
 * n      number of complex elements.
 * x      interleaved (re,im) data.
 * inc_x  stride between consecutive elements, in complex elements.
 *
 * Returns 0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading multiple-of-32 part is handled by
 * icamax_kernel_32 and the remainder by a scalar loop; any other stride
 * uses the scalar loop only.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;
    BLASLONG ix = 0;
    FLOAT maxf = 0;
    BLASLONG max = 0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return(max);

    if (inc_x == 1) {

        BLASLONG n1 = n & -32;
        if (n1 > 0) {

            max = icamax_kernel_32(n1, x, &maxf);

            /* Continue the scalar scan right after the vectorised part.
               ix indexes floats, so it advances by two per element;
               previously it stayed at 0 and the tail re-read x[0..]. */
            ix = n1 * 2;
            i = n1;
        } else {
            /* Fewer than 32 elements: seed with element 0, like the
               strided branch below. */
            maxf = CABS1(x,0);
            ix += 2;
            i++;
        }

        while (i < n) {
            if (CABS1(x,ix) > maxf) {
                max = i;
                maxf = CABS1(x,ix);
            }
            ix += 2;
            i++;
        }
        return (max + 1);

    } else {

        inc_x2 = 2 * inc_x;

        maxf = CABS1(x,0);
        ix += inc_x2;
        i++;

        while (i < n) {
            if (CABS1(x,ix) > maxf) {
                max = i;
                maxf = CABS1(x,ix);
            }
            ix += inc_x2;
            i++;
        }
        return (max + 1);
    }
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,319 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * Vector kernel for ICAMIN on interleaved single-precision complex data.
 *
 * Scans n complex elements of x (re,im pairs, unit stride) and finds the
 * element with the smallest |re| + |im|.
 *
 * n     number of complex elements; the loop runs n >> 5 times with a
 *       decrement-and-branch, so n must be a positive multiple of 32
 *       (the driver in this file only calls it with n & -32 when that is > 0).
 * x     input vector, 2 * n floats.
 * amin  (out) the winning |re| + |im| value.
 * returns the 0-based index of the winning element.
 *
 * Register roles inside the asm:
 *   v0        running per-lane minima (4 float lanes)
 *   v1 / v2   64-bit indices belonging to float lanes {0,2} / {1,3} of v0
 *   v3        constant 16 (elements consumed per half pass)
 *   v4        base index of the 16-element group being processed
 *   v24..v27  32-bit lane indices 0..15 inside a 16-element group
 *   r0 / r1   loop counter / running byte offset
 *
 * Fixes relative to the previous revision:
 *   - the imaginary part of element 7 was loaded from byte offset 30
 *     instead of 60;
 *   - comparisons inside a 16-element group used strict "greater than",
 *     so equal magnitudes selected the later element; they now use
 *     "greater or equal" so the lower index wins, matching the strict
 *     comparison used by the scalar code in the driver;
 *   - the scalar result is now stored before the "=r" output register is
 *     written, because a non-early-clobber output may share a register with
 *     the address of the "=m" operand.
 */
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
{
    BLASLONG iamin;

    __asm__ volatile (
        /* Seed v0 with |re|+|im| of elements 0..3 (de-interleaving loads). */
        "vlef   %%v0,0(%3),0            \n\t"
        "vlef   %%v1,4(%3),0            \n\t"
        "vlef   %%v0,8(%3),1            \n\t"
        "vlef   %%v1,12(%3),1           \n\t"
        "vlef   %%v0,16(%3),2           \n\t"
        "vlef   %%v1,20(%3),2           \n\t"
        "vlef   %%v0,24(%3),3           \n\t"
        "vlef   %%v1,28(%3),3           \n\t"
        "vflpsb %%v0,%%v0               \n\t"
        "vflpsb %%v1,%%v1               \n\t"
        "vfasb  %%v0,%%v0,%%v1          \n\t"
        /* Indices of the seed: v1 = {0,2}, v2 = {1,3}. */
        "vleig  %%v1,0,0                \n\t"
        "vleig  %%v1,2,1                \n\t"
        "vleig  %%v2,1,0                \n\t"
        "vleig  %%v2,3,1                \n\t"
        "vrepig %%v3,16                 \n\t"
        "vzero  %%v4                    \n\t"
        /* Lane index tables 0..15. */
        "vleif  %%v24,0,0               \n\t"
        "vleif  %%v24,1,1               \n\t"
        "vleif  %%v24,2,2               \n\t"
        "vleif  %%v24,3,3               \n\t"
        "vleif  %%v25,4,0               \n\t"
        "vleif  %%v25,5,1               \n\t"
        "vleif  %%v25,6,2               \n\t"
        "vleif  %%v25,7,3               \n\t"
        "vleif  %%v26,8,0               \n\t"
        "vleif  %%v26,9,1               \n\t"
        "vleif  %%v26,10,2              \n\t"
        "vleif  %%v26,11,3              \n\t"
        "vleif  %%v27,12,0              \n\t"
        "vleif  %%v27,13,1              \n\t"
        "vleif  %%v27,14,2              \n\t"
        "vleif  %%v27,15,3              \n\t"
        "srlg   %%r0,%2,5               \n\t"
        "xgr    %%r1,%%r1               \n\t"
        "0:                             \n\t"
        "pfd    1, 1024(%%r1,%3)        \n\t"

        /* ---- first 16 elements of the pass ---- */
        "vlef   %%v16,0(%%r1,%3),0      \n\t"
        "vlef   %%v17,4(%%r1,%3),0      \n\t"
        "vlef   %%v16,8(%%r1,%3),1      \n\t"
        "vlef   %%v17,12(%%r1,%3),1     \n\t"
        "vlef   %%v16,16(%%r1,%3),2     \n\t"
        "vlef   %%v17,20(%%r1,%3),2     \n\t"
        "vlef   %%v16,24(%%r1,%3),3     \n\t"
        "vlef   %%v17,28(%%r1,%3),3     \n\t"

        "vlef   %%v18,32(%%r1,%3),0     \n\t"
        "vlef   %%v19,36(%%r1,%3),0     \n\t"
        "vlef   %%v18,40(%%r1,%3),1     \n\t"
        "vlef   %%v19,44(%%r1,%3),1     \n\t"
        "vlef   %%v18,48(%%r1,%3),2     \n\t"
        "vlef   %%v19,52(%%r1,%3),2     \n\t"
        "vlef   %%v18,56(%%r1,%3),3     \n\t"
        /* imaginary part of element 7: byte offset 60 (was 30) */
        "vlef   %%v19,60(%%r1,%3),3     \n\t"

        "vlef   %%v20,64(%%r1,%3),0     \n\t"
        "vlef   %%v21,68(%%r1,%3),0     \n\t"
        "vlef   %%v20,72(%%r1,%3),1     \n\t"
        "vlef   %%v21,76(%%r1,%3),1     \n\t"
        "vlef   %%v20,80(%%r1,%3),2     \n\t"
        "vlef   %%v21,84(%%r1,%3),2     \n\t"
        "vlef   %%v20,88(%%r1,%3),3     \n\t"
        "vlef   %%v21,92(%%r1,%3),3     \n\t"

        "vlef   %%v22,96(%%r1,%3),0     \n\t"
        "vlef   %%v23,100(%%r1,%3),0    \n\t"
        "vlef   %%v22,104(%%r1,%3),1    \n\t"
        "vlef   %%v23,108(%%r1,%3),1    \n\t"
        "vlef   %%v22,112(%%r1,%3),2    \n\t"
        "vlef   %%v23,116(%%r1,%3),2    \n\t"
        "vlef   %%v22,120(%%r1,%3),3    \n\t"
        "vlef   %%v23,124(%%r1,%3),3    \n\t"

        /* |re| + |im| for the four groups of four elements. */
        "vflpsb %%v16, %%v16            \n\t"
        "vflpsb %%v17, %%v17            \n\t"
        "vflpsb %%v18, %%v18            \n\t"
        "vflpsb %%v19, %%v19            \n\t"
        "vflpsb %%v20, %%v20            \n\t"
        "vflpsb %%v21, %%v21            \n\t"
        "vflpsb %%v22, %%v22            \n\t"
        "vflpsb %%v23, %%v23            \n\t"
        "vfasb  %%v16,%%v16,%%v17       \n\t"
        "vfasb  %%v17,%%v18,%%v19       \n\t"
        "vfasb  %%v18,%%v20,%%v21       \n\t"
        "vfasb  %%v19,%%v22,%%v23       \n\t"

        /* Pairwise minima; ">=" keeps the lower-index group on ties. */
        "vfchesb %%v5,%%v17,%%v16       \n\t"
        "vfchesb %%v6,%%v19,%%v18       \n\t"
        "vsel   %%v16,%%v16,%%v17,%%v5  \n\t"
        "vsel   %%v5,%%v24,%%v25,%%v5   \n\t"
        "vsel   %%v17,%%v18,%%v19,%%v6  \n\t"
        "vsel   %%v6,%%v26,%%v27,%%v6   \n\t"

        "vfchesb %%v18,%%v17,%%v16      \n\t"
        "vsel   %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel   %%v5,%%v5,%%v6,%%v18    \n\t"
        /* Widen the 32-bit lane indices to 64 bit and add the group base. */
        "vsegf  %%v6,%%v5               \n\t"
        "vesrlg %%v5,%%v5,32            \n\t"
        "vag    %%v5,%%v5,%%v4          \n\t"
        "vag    %%v6,%%v6,%%v4          \n\t"

        /* Merge into the running minima (strict compare keeps earlier data). */
        "vfchsb %%v7,%%v0,%%v16         \n\t"
        "vsel   %%v0,%%v16,%%v0,%%v7    \n\t"
        "vsegf  %%v8,%%v7               \n\t"
        "vesrlg %%v7,%%v7,32            \n\t"
        "vsegf  %%v7,%%v7               \n\t"
        "vsel   %%v1,%%v5,%%v1,%%v7     \n\t"
        "vsel   %%v2,%%v6,%%v2,%%v8     \n\t"
        "vag    %%v4,%%v4,%%v3          \n\t"

        /* ---- second 16 elements of the pass ---- */
        "vlef   %%v16,128(%%r1,%3),0    \n\t"
        "vlef   %%v17,132(%%r1,%3),0    \n\t"
        "vlef   %%v16,136(%%r1,%3),1    \n\t"
        "vlef   %%v17,140(%%r1,%3),1    \n\t"
        "vlef   %%v16,144(%%r1,%3),2    \n\t"
        "vlef   %%v17,148(%%r1,%3),2    \n\t"
        "vlef   %%v16,152(%%r1,%3),3    \n\t"
        "vlef   %%v17,156(%%r1,%3),3    \n\t"

        "vlef   %%v18,160(%%r1,%3),0    \n\t"
        "vlef   %%v19,164(%%r1,%3),0    \n\t"
        "vlef   %%v18,168(%%r1,%3),1    \n\t"
        "vlef   %%v19,172(%%r1,%3),1    \n\t"
        "vlef   %%v18,176(%%r1,%3),2    \n\t"
        "vlef   %%v19,180(%%r1,%3),2    \n\t"
        "vlef   %%v18,184(%%r1,%3),3    \n\t"
        "vlef   %%v19,188(%%r1,%3),3    \n\t"

        "vlef   %%v20,192(%%r1,%3),0    \n\t"
        "vlef   %%v21,196(%%r1,%3),0    \n\t"
        "vlef   %%v20,200(%%r1,%3),1    \n\t"
        "vlef   %%v21,204(%%r1,%3),1    \n\t"
        "vlef   %%v20,208(%%r1,%3),2    \n\t"
        "vlef   %%v21,212(%%r1,%3),2    \n\t"
        "vlef   %%v20,216(%%r1,%3),3    \n\t"
        "vlef   %%v21,220(%%r1,%3),3    \n\t"

        "vlef   %%v22,224(%%r1,%3),0    \n\t"
        "vlef   %%v23,228(%%r1,%3),0    \n\t"
        "vlef   %%v22,232(%%r1,%3),1    \n\t"
        "vlef   %%v23,236(%%r1,%3),1    \n\t"
        "vlef   %%v22,240(%%r1,%3),2    \n\t"
        "vlef   %%v23,244(%%r1,%3),2    \n\t"
        "vlef   %%v22,248(%%r1,%3),3    \n\t"
        "vlef   %%v23,252(%%r1,%3),3    \n\t"

        "vflpsb %%v16, %%v16            \n\t"
        "vflpsb %%v17, %%v17            \n\t"
        "vflpsb %%v18, %%v18            \n\t"
        "vflpsb %%v19, %%v19            \n\t"
        "vflpsb %%v20, %%v20            \n\t"
        "vflpsb %%v21, %%v21            \n\t"
        "vflpsb %%v22, %%v22            \n\t"
        "vflpsb %%v23, %%v23            \n\t"
        "vfasb  %%v16,%%v16,%%v17       \n\t"
        "vfasb  %%v17,%%v18,%%v19       \n\t"
        "vfasb  %%v18,%%v20,%%v21       \n\t"
        "vfasb  %%v19,%%v22,%%v23       \n\t"

        "vfchesb %%v5,%%v17,%%v16       \n\t"
        "vfchesb %%v6,%%v19,%%v18       \n\t"
        "vsel   %%v16,%%v16,%%v17,%%v5  \n\t"
        "vsel   %%v5,%%v24,%%v25,%%v5   \n\t"
        "vsel   %%v17,%%v18,%%v19,%%v6  \n\t"
        "vsel   %%v6,%%v26,%%v27,%%v6   \n\t"

        "vfchesb %%v18,%%v17,%%v16      \n\t"
        "vsel   %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel   %%v5,%%v5,%%v6,%%v18    \n\t"
        "vsegf  %%v6,%%v5               \n\t"
        "vesrlg %%v5,%%v5,32            \n\t"
        "vag    %%v5,%%v5,%%v4          \n\t"
        "vag    %%v6,%%v6,%%v4          \n\t"

        "vfchsb %%v7,%%v0,%%v16         \n\t"
        "vsel   %%v0,%%v16,%%v0,%%v7    \n\t"
        "vsegf  %%v8,%%v7               \n\t"
        "vesrlg %%v7,%%v7,32            \n\t"
        "vsegf  %%v7,%%v7               \n\t"
        "vsel   %%v1,%%v5,%%v1,%%v7     \n\t"
        "vsel   %%v2,%%v6,%%v2,%%v8     \n\t"
        "vag    %%v4,%%v4,%%v3          \n\t"

        /* 32 complex floats = 256 bytes per pass. */
        "agfi   %%r1, 256               \n\t"
        "brctg  %%r0, 0b                \n\t"

        /* Reduce lanes {0,1} and {2,3}; on equal values the lower index wins. */
        "veslg  %%v3,%%v0,32            \n\t"
        "vfchsb %%v4,%%v3,%%v0          \n\t"
        "vchlg  %%v5,%%v2,%%v1          \n\t"
        "vfcesb %%v6,%%v0,%%v3          \n\t"
        "vn     %%v5,%%v5,%%v6          \n\t"
        "vo     %%v4,%%v4,%%v5          \n\t"
        "vsel   %%v0,%%v0,%%v3,%%v4     \n\t"
        "vesrlg %%v4,%%v4,32            \n\t"
        "vsegf  %%v4,%%v4               \n\t"
        "vsel   %%v1,%%v1,%%v2,%%v4     \n\t"

        /* Final scalar reduction between the two remaining candidates. */
        "vrepf  %%v2,%%v0,2             \n\t"
        "vrepg  %%v3,%%v1,1             \n\t"
        "wfcsb  %%v2,%%v0               \n\t"
        "jne    1f                      \n\t"
        /* Equal values: store the value, return the smaller index. */
        "vstef  %%v0,%1,0               \n\t"
        "vmnlg  %%v0,%%v1,%%v3          \n\t"
        "vlgvg  %0,%%v0,0               \n\t"
        "j      2f                      \n\t"
        "1:                             \n\t"
        "wfchsb %%v4,%%v0,%%v2          \n\t"
        "vsel   %%v1,%%v3,%%v1,%%v4     \n\t"
        "vsel   %%v0,%%v2,%%v0,%%v4     \n\t"
        /* Store through %1 before %0 is written (see header comment). */
        "ste    %%f0,%1                 \n\t"
        "vlgvg  %0,%%v1,0               \n\t"
        "2:                             \n\t"
        "nop "
        :"=r"(iamin),"=m"(*amin)
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return iamin;
}
|
||||
|
||||
/*
 * ICAMIN driver: 1-based index of the first complex element of x whose
 * |re| + |im| is smallest.
 *
 * n      number of complex elements.
 * x      interleaved (re,im) data.
 * inc_x  stride between consecutive elements, in complex elements.
 *
 * Returns 0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading multiple-of-32 part is handled by
 * icamin_kernel_32 and the remainder by a scalar loop; any other stride
 * uses the scalar loop only.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;
    BLASLONG ix = 0;
    FLOAT minf = 0;
    BLASLONG min = 0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return(min);

    if (inc_x == 1) {

        BLASLONG n1 = n & -32;
        if (n1 > 0) {

            min = icamin_kernel_32(n1, x, &minf);

            /* Continue the scalar scan right after the vectorised part.
               ix indexes floats, so it advances by two per element;
               previously it stayed at 0 and the tail re-read x[0..]. */
            ix = n1 * 2;
            i = n1;
        } else {
            /* Fewer than 32 elements: the running minimum must start from
               element 0. Starting from the initial 0 made "< minf"
               unsatisfiable, so index 1 was always returned. */
            minf = CABS1(x,0);
            ix += 2;
            i++;
        }

        while (i < n) {
            if (CABS1(x,ix) < minf) {
                min = i;
                minf = CABS1(x,ix);
            }
            ix += 2;
            i++;
        }
        return (min + 1);

    } else {

        inc_x2 = 2 * inc_x;

        minf = CABS1(x,0);
        ix += inc_x2;
        i++;

        while (i < n) {
            if (CABS1(x,ix) < minf) {
                min = i;
                minf = CABS1(x,ix);
            }
            ix += inc_x2;
            i++;
        }
        return (min + 1);
    }
}
|
||||
|
||||
|
||||
|
|
@ -23,164 +23,173 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* Find maximum index
|
||||
* Warning: requirements n>0 and n % 32 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param maxf (out) maximum absolute value .( only for output )
|
||||
* @return index
|
||||
*/
|
||||
static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||
BLASLONG index;
|
||||
__asm__(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vleig %%v20,0,0 \n\t"
|
||||
"vleig %%v20,1,1 \n\t"
|
||||
"vleig %%v21,2,0 \n\t"
|
||||
"vleig %%v21,3,1 \n\t"
|
||||
"vleig %%v22,4,0 \n\t"
|
||||
"vleig %%v22,5,1 \n\t"
|
||||
"vleig %%v23,6,0 \n\t"
|
||||
"vleig %%v23,7,1 \n\t"
|
||||
"vrepig %%v4,8 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
"vzero %%v18 \n\t"
|
||||
"vzero %%v19 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"vfchdb %%v16,%%v25,%%v24 \n\t "
|
||||
"vfchdb %%v17,%%v27,%%v26 \n\t "
|
||||
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
|
||||
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
|
||||
"vfchdb %%v16,%%v29,%%v28 \n\t "
|
||||
"vfchdb %%v17,%%v31,%%v30 \n\t"
|
||||
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
|
||||
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
|
||||
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||
{
|
||||
BLASLONG iamax;
|
||||
|
||||
"vfchdb %%v28, %%v3,%%v0 \n\t"
|
||||
"vfchdb %%v29,%%v27, %%v25 \n\t"
|
||||
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
|
||||
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
|
||||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
"vfchdb %%v16,%%v25 , %%v0 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
"vfchdb %%v17, %%v29,%%v18 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"vfchdb %%v16,%%v25,%%v24 \n\t "
|
||||
"vfchdb %%v17,%%v27,%%v26 \n\t "
|
||||
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
|
||||
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
|
||||
"vfchdb %%v16,%%v29,%%v28 \n\t "
|
||||
"vfchdb %%v17,%%v31,%%v30 \n\t"
|
||||
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
|
||||
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
|
||||
__asm__ volatile (
|
||||
"vl %%v0,0(%3) \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vleig %%v1,0,0 \n\t"
|
||||
"vleig %%v1,1,1 \n\t"
|
||||
"vrepig %%v2,16 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"vleig %%v24,0,0 \n\t"
|
||||
"vleig %%v24,1,1 \n\t"
|
||||
"vleig %%v25,2,0 \n\t"
|
||||
"vleig %%v25,3,1 \n\t"
|
||||
"vleig %%v26,4,0 \n\t"
|
||||
"vleig %%v26,5,1 \n\t"
|
||||
"vleig %%v27,6,0 \n\t"
|
||||
"vleig %%v27,7,1 \n\t"
|
||||
"vleig %%v28,8,0 \n\t"
|
||||
"vleig %%v28,9,1 \n\t"
|
||||
"vleig %%v29,10,0 \n\t"
|
||||
"vleig %%v29,11,1 \n\t"
|
||||
"vleig %%v30,12,0 \n\t"
|
||||
"vleig %%v30,13,1 \n\t"
|
||||
"vleig %%v31,14,0 \n\t"
|
||||
"vleig %%v31,15,1 \n\t"
|
||||
"srlg %%r0,%2,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
||||
|
||||
"vfchdb %%v28, %%v3,%%v0 \n\t"
|
||||
"vfchdb %%v29,%%v27, %%v25 \n\t"
|
||||
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
|
||||
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
|
||||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
"vfchdb %%v16,%%v25 , %%v0 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
"vfchdb %%v17, %%v29,%%v18 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
"vl %%v16,0(%%r1,%3) \n\t"
|
||||
"vl %%v17,16(%%r1,%3) \n\t"
|
||||
"vl %%v18,32(%%r1,%3) \n\t"
|
||||
"vl %%v19,48(%%r1,%3) \n\t"
|
||||
"vl %%v20,64(%%r1,%3) \n\t"
|
||||
"vl %%v21,80(%%r1,%3) \n\t"
|
||||
"vl %%v22,96(%%r1,%3) \n\t"
|
||||
"vl %%v23,112(%%r1,%3) \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfchdb %%v4,%%v16,%%v17 \n\t"
|
||||
"vfchdb %%v5,%%v18,%%v19 \n\t"
|
||||
"vfchdb %%v6,%%v20,%%v21 \n\t"
|
||||
"vfchdb %%v7,%%v22,%%v23 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
"vrepg %%v26,%%v18,1 \n\t"
|
||||
"vrepg %%v5,%%v19,1 \n\t"
|
||||
"wfcdb %%v26,%%v18 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v18,%[maxf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v19 \n\t"
|
||||
"j 3f \n\t"
|
||||
"vfchdb %%v20,%%v16,%%v17 \n\t"
|
||||
"vfchdb %%v21,%%v18,%%v19 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v26,%%v18 \n\t"
|
||||
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
|
||||
"std %%f0,%[maxf] \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return index;
|
||||
"vfchdb %%v18,%%v16,%%v17 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchdb %%v5,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
|
||||
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"vl %%v16,128(%%r1,%3) \n\t"
|
||||
"vl %%v17,144(%%r1,%3) \n\t"
|
||||
"vl %%v18,160(%%r1,%3) \n\t"
|
||||
"vl %%v19,176(%%r1,%3) \n\t"
|
||||
"vl %%v20,192(%%r1,%3) \n\t"
|
||||
"vl %%v21,208(%%r1,%3) \n\t"
|
||||
"vl %%v22,224(%%r1,%3) \n\t"
|
||||
"vl %%v23,240(%%r1,%3) \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfchdb %%v4,%%v16,%%v17 \n\t"
|
||||
"vfchdb %%v5,%%v18,%%v19 \n\t"
|
||||
"vfchdb %%v6,%%v20,%%v21 \n\t"
|
||||
"vfchdb %%v7,%%v22,%%v23 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
"vfchdb %%v20,%%v16,%%v17 \n\t"
|
||||
"vfchdb %%v21,%%v18,%%v19 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
"vfchdb %%v18,%%v16,%%v17 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchdb %%v5,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
|
||||
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"vrepg %%v3,%%v1,1 \n\t"
|
||||
"wfcdb %%v2,%%v0 \n\t"
|
||||
"jne 1f \n\t"
|
||||
"vsteg %%v0,%1,0 \n\t"
|
||||
"vmnlg %%v0,%%v1,%%v3 \n\t"
|
||||
"vlgvg %0,%%v0,0 \n\t"
|
||||
"j 2f \n\t"
|
||||
"1: \n\t"
|
||||
"wfchdb %%v4,%%v2,%%v0 \n\t"
|
||||
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
|
||||
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"2: \n\t"
|
||||
"nop "
|
||||
:"=r"(iamax),"=m"(*amax)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return iamax;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG j = 0;
|
||||
BLASLONG ix = 0;
|
||||
FLOAT maxf = 0.0;
|
||||
BLASLONG max = 0;
|
||||
|
||||
|
|
@ -191,7 +200,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
|||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
|
||||
max = diamax_kernel_32_TUNED(n1, x, &maxf);
|
||||
max = idamax_kernel_32(n1, x, &maxf);
|
||||
|
||||
i = n1;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,192 +23,185 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Find minimum index
|
||||
* Warning: requirements n>0 and n % 32 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param minf (out) minimum absolute value (output only)
|
||||
* @return minimum index
|
||||
*/
|
||||
static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||
BLASLONG index;
|
||||
__asm__(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vleig %%v20,0,0 \n\t"
|
||||
"vleig %%v20,1,1 \n\t"
|
||||
"vleig %%v21,2,0 \n\t"
|
||||
"vleig %%v21,3,1 \n\t"
|
||||
"vleig %%v22,4,0 \n\t"
|
||||
"vleig %%v22,5,1 \n\t"
|
||||
"vleig %%v23,6,0 \n\t"
|
||||
"vleig %%v23,7,1 \n\t"
|
||||
"vrepig %%v4,8 \n\t"
|
||||
"vlrepg %%v18,0(%[ptr_x]) \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vzero %%v19 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
|
||||
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||
{
|
||||
BLASLONG iamin;
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
__asm__ volatile (
|
||||
"vl %%v0,0(%3) \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vleig %%v1,0,0 \n\t"
|
||||
"vleig %%v1,1,1 \n\t"
|
||||
"vrepig %%v2,16 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"vleig %%v24,0,0 \n\t"
|
||||
"vleig %%v24,1,1 \n\t"
|
||||
"vleig %%v25,2,0 \n\t"
|
||||
"vleig %%v25,3,1 \n\t"
|
||||
"vleig %%v26,4,0 \n\t"
|
||||
"vleig %%v26,5,1 \n\t"
|
||||
"vleig %%v27,6,0 \n\t"
|
||||
"vleig %%v27,7,1 \n\t"
|
||||
"vleig %%v28,8,0 \n\t"
|
||||
"vleig %%v28,9,1 \n\t"
|
||||
"vleig %%v29,10,0 \n\t"
|
||||
"vleig %%v29,11,1 \n\t"
|
||||
"vleig %%v30,12,0 \n\t"
|
||||
"vleig %%v30,13,1 \n\t"
|
||||
"vleig %%v31,14,0 \n\t"
|
||||
"vleig %%v31,15,1 \n\t"
|
||||
"srlg %%r0,%2,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
||||
|
||||
"vfchdb %%v16,%%v24,%%v25 \n\t "
|
||||
"vfchdb %%v17,%%v26 ,%%v27 \n\t "
|
||||
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
|
||||
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
|
||||
"vfchdb %%v16,%%v28, %%v29 \n\t "
|
||||
"vfchdb %%v17,%%v30,%%v31 \n\t"
|
||||
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
|
||||
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
|
||||
"vl %%v16,0(%%r1,%3) \n\t"
|
||||
"vl %%v17,16(%%r1,%3) \n\t"
|
||||
"vl %%v18,32(%%r1,%3) \n\t"
|
||||
"vl %%v19,48(%%r1,%3) \n\t"
|
||||
"vl %%v20,64(%%r1,%3) \n\t"
|
||||
"vl %%v21,80(%%r1,%3) \n\t"
|
||||
"vl %%v22,96(%%r1,%3) \n\t"
|
||||
"vl %%v23,112(%%r1,%3) \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfchdb %%v4,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v5,%%v19,%%v18 \n\t"
|
||||
"vfchdb %%v6,%%v21,%%v20 \n\t"
|
||||
"vfchdb %%v7,%%v23,%%v22 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
"vfchdb %%v20,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v21,%%v19,%%v18 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
"vfchdb %%v28,%%v0 , %%v3 \n\t"
|
||||
"vfchdb %%v29, %%v25,%%v27 \n\t"
|
||||
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
|
||||
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
|
||||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
"vfchdb %%v18,%%v17,%%v16 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
"vfchdb %%v5,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
|
||||
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"vfchdb %%v16, %%v0,%%v25 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
"vl %%v16,128(%%r1,%3) \n\t"
|
||||
"vl %%v17,144(%%r1,%3) \n\t"
|
||||
"vl %%v18,160(%%r1,%3) \n\t"
|
||||
"vl %%v19,176(%%r1,%3) \n\t"
|
||||
"vl %%v20,192(%%r1,%3) \n\t"
|
||||
"vl %%v21,208(%%r1,%3) \n\t"
|
||||
"vl %%v22,224(%%r1,%3) \n\t"
|
||||
"vl %%v23,240(%%r1,%3) \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfchdb %%v17,%%v18, %%v29 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
|
||||
"vfchdb %%v4,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v5,%%v19,%%v18 \n\t"
|
||||
"vfchdb %%v6,%%v21,%%v20 \n\t"
|
||||
"vfchdb %%v7,%%v23,%%v22 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vfchdb %%v20,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v21,%%v19,%%v18 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"vfchdb %%v18,%%v17,%%v16 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchdb %%v16,%%v24,%%v25 \n\t"
|
||||
"vfchdb %%v17,%%v26 ,%%v27 \n\t"
|
||||
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
|
||||
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
|
||||
"vfchdb %%v16,%%v28 ,%%v29 \n\t"
|
||||
"vfchdb %%v17,%%v30,%%v31 \n\t"
|
||||
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
|
||||
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
|
||||
"vfchdb %%v5,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
|
||||
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
"vfchdb %%v28,%%v0 , %%v3 \n\t"
|
||||
"vfchdb %%v29, %%v25,%%v27 \n\t"
|
||||
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
|
||||
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
|
||||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
|
||||
"vfchdb %%v16, %%v0,%%v25 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
|
||||
"vfchdb %%v17,%%v18, %%v29 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
|
||||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
|
||||
|
||||
"vrepg %%v26,%%v18,1 \n\t"
|
||||
"vrepg %%v5,%%v19,1 \n\t"
|
||||
"wfcdb %%v26,%%v18 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v18,%[minf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v19 \n\t"
|
||||
"j 3f \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v18 ,%%v26 \n\t "
|
||||
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
|
||||
"std %%f0,%[minf] \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
|
||||
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
|
||||
);
|
||||
|
||||
return index;
|
||||
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"vrepg %%v3,%%v1,1 \n\t"
|
||||
"wfcdb %%v2,%%v0 \n\t"
|
||||
"jne 1f \n\t"
|
||||
"vsteg %%v0,%1,0 \n\t"
|
||||
"vmnlg %%v0,%%v1,%%v3 \n\t"
|
||||
"vlgvg %0,%%v0,0 \n\t"
|
||||
"j 2f \n\t"
|
||||
"1: \n\t"
|
||||
"wfchdb %%v4,%%v0,%%v2 \n\t"
|
||||
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
|
||||
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"2: \n\t"
|
||||
"nop "
|
||||
:"=r"(iamin),"=m"(*amin)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return iamin;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG j = 0;
|
||||
BLASLONG ix = 0;
|
||||
BLASLONG min = 0;
|
||||
FLOAT minf = 0.0;
|
||||
|
||||
BLASLONG min = 0;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return (min);
|
||||
minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant
|
||||
|
||||
if (inc_x == 1) {
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
|
||||
min = diamin_kernel_32(n1, x, &minf);
|
||||
min = idamin_kernel_32(n1, x, &minf);
|
||||
|
||||
i = n1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,232 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * idmax_kernel_32 - s390x vector kernel that locates the largest element of
 * a contiguous vector (signed comparison, no absolute value is taken).
 *
 * n    number of elements to scan.  The loop counter is n >> 5 and every
 *      pass consumes 32 eight-byte elements, so n must be a positive
 *      multiple of 32 (the caller passes n & -32 and only when it is > 0).
 * x    input vector, read through operand %3.
 * max  out: receives the largest value found.
 *
 * Returns the 0-based index of that value; the caller adds 1.
 *
 * Register roles inside the asm:
 *   v0 / v1    running per-lane maximum and its index
 *   v3 / v2    base index of the current 16-element group / step of 16
 *   v24..v31   constant per-lane index offsets 0..15
 *   v16..v23   the 16 elements of the current group, then reduction temps
 *   v4..v7     compare masks, then the indices travelling with the values
 *   r0 / r1    loop counter / byte offset into x
 *
 * Tie handling: the three in-group reduction levels use "compare high or
 * equal" so that, when two candidates are equal, the first operand (always
 * the lower index) is kept.  Together with the strict compare against the
 * running maximum this yields the first occurrence of the maximum, which is
 * what the scalar loops in CNAME (strict '>') produce.  With a strict
 * compare in the reduction, equal values inside one group resolved to the
 * later index instead.
 *
 * NOTE(review): this block needs an s390x target with the vector facility;
 * the tie-break change could not be executed here and should be confirmed
 * with a vector containing repeated maxima inside one 16-element group.
 */
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
{
    BLASLONG imax;

    __asm__ volatile (
        /* Seed: running maximum = first two elements, indices {0,1}. */
        "vl %%v0,0(%3) \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,1,1 \n\t"
        /* v2 = {16,16} group step, v3 = {0,0} group base. */
        "vrepig %%v2,16 \n\t"
        "vzero %%v3 \n\t"
        /* v24..v31 = index offsets {0,1},{2,3},...,{14,15}. */
        "vleig %%v24,0,0 \n\t"
        "vleig %%v24,1,1 \n\t"
        "vleig %%v25,2,0 \n\t"
        "vleig %%v25,3,1 \n\t"
        "vleig %%v26,4,0 \n\t"
        "vleig %%v26,5,1 \n\t"
        "vleig %%v27,6,0 \n\t"
        "vleig %%v27,7,1 \n\t"
        "vleig %%v28,8,0 \n\t"
        "vleig %%v28,9,1 \n\t"
        "vleig %%v29,10,0 \n\t"
        "vleig %%v29,11,1 \n\t"
        "vleig %%v30,12,0 \n\t"
        "vleig %%v30,13,1 \n\t"
        "vleig %%v31,14,0 \n\t"
        "vleig %%v31,15,1 \n\t"
        /* r0 = n / 32 iterations, r1 = byte offset 0. */
        "srlg %%r0,%2,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* ---- first group of 16 elements (bytes 0..127) ---- */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"

        /* Level 1: 8 vectors -> 4, values in v16..v19, indices in v4..v7.
           ">=" keeps the lower-indexed operand on a tie. */
        "vfchedb %%v4,%%v16,%%v17 \n\t"
        "vfchedb %%v5,%%v18,%%v19 \n\t"
        "vfchedb %%v6,%%v20,%%v21 \n\t"
        "vfchedb %%v7,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v4 \n\t"
        "vsel %%v4,%%v24,%%v25,%%v4 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v5 \n\t"
        "vsel %%v5,%%v26,%%v27,%%v5 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v6 \n\t"
        "vsel %%v6,%%v28,%%v29,%%v6 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v7 \n\t"
        "vsel %%v7,%%v30,%%v31,%%v7 \n\t"

        /* Level 2: 4 -> 2. */
        "vfchedb %%v20,%%v16,%%v17 \n\t"
        "vfchedb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v5,%%v6,%%v7,%%v21 \n\t"

        /* Level 3: 2 -> 1, then turn the offset into an absolute index. */
        "vfchedb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Merge into the running maximum; strict ">" keeps the earlier
           group on a tie.  Then advance the group base by 16. */
        "vfchdb %%v5,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v5 \n\t"
        "vsel %%v1,%%v4,%%v1,%%v5 \n\t"
        "vag %%v3,%%v3,%%v2 \n\t"

        /* ---- second group of 16 elements (bytes 128..255) ---- */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"

        "vfchedb %%v4,%%v16,%%v17 \n\t"
        "vfchedb %%v5,%%v18,%%v19 \n\t"
        "vfchedb %%v6,%%v20,%%v21 \n\t"
        "vfchedb %%v7,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v4 \n\t"
        "vsel %%v4,%%v24,%%v25,%%v4 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v5 \n\t"
        "vsel %%v5,%%v26,%%v27,%%v5 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v6 \n\t"
        "vsel %%v6,%%v28,%%v29,%%v6 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v7 \n\t"
        "vsel %%v7,%%v30,%%v31,%%v7 \n\t"

        "vfchedb %%v20,%%v16,%%v17 \n\t"
        "vfchedb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v5,%%v6,%%v7,%%v21 \n\t"

        "vfchedb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "vfchdb %%v5,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v5 \n\t"
        "vsel %%v1,%%v4,%%v1,%%v5 \n\t"
        "vag %%v3,%%v3,%%v2 \n\t"

        /* Next 256 bytes; loop while the count in r0 is not exhausted. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Combine the two lanes: v2/v3 = lane 1 value/index replicated. */
        "vrepg %%v2,%%v0,1 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcdb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Lanes hold equal values: store it and take the smaller index. */
        "vsteg %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Lanes differ: take lane 1 when it is greater, else lane 0. */
        "wfchdb %%v4,%%v2,%%v0 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "std %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(imax),"=m"(*max)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imax;
}
|
||||
|
||||
/*
 * Return the 1-based position of the largest element (plain signed
 * comparison, no absolute value) among n elements of x read with stride
 * inc_x.  When several elements share the maximum, the scalar loops below
 * keep the first one because they compare with a strict '>'.
 *
 * n      number of elements; n <= 0 yields 0.
 * x      input vector.
 * inc_x  stride between consecutive elements; inc_x <= 0 yields 0.
 *
 * For unit stride the leading multiple-of-32 part is handed to
 * idmax_kernel_32 and the remainder is finished by the scalar loop.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT maxf;
    BLASLONG max = 0;

    if (n <= 0 || inc_x <= 0) return (max);

    /*
     * Seed the running maximum with the first element.  Starting from 0.0
     * made every vector whose elements are all <= 0 report position 1
     * whenever the kernel did not run (n < 32 or inc_x != 1).  Index 0 is
     * compared against itself once; that is redundant but harmless.
     */
    maxf = x[0];

    if (inc_x == 1) {

        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            /* The kernel overwrites maxf and returns a 0-based index. */
            max = idmax_kernel_32(n1, x, &maxf);
            i = n1;
        }

        while (i < n) {
            if (x[i] > maxf) {
                max = i;
                maxf = x[i];
            }
            i++;
        }
        return (max + 1);
    }

    /* Strided access: i walks the storage offset, j the logical index. */
    while (j < n) {
        if (x[i] > maxf) {
            max = j;
            maxf = x[i];
        }
        i += inc_x;
        j++;
    }
    return (max + 1);
}
|
||||
|
|
@ -0,0 +1,232 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * idmin_kernel_32 - s390x vector kernel that locates the smallest element of
 * a contiguous vector (signed comparison, no absolute value is taken).
 *
 * n    number of elements to scan.  The loop counter is n >> 5 and every
 *      pass consumes 32 eight-byte elements, so n must be a positive
 *      multiple of 32 (the caller passes n & -32 and only when it is > 0).
 * x    input vector, read through operand %3.
 * min  out: receives the smallest value found.
 *
 * Returns the 0-based index of that value; the caller adds 1.
 *
 * Register roles inside the asm:
 *   v0 / v1    running per-lane minimum and its index
 *   v3 / v2    base index of the current 16-element group / step of 16
 *   v24..v31   constant per-lane index offsets 0..15
 *   v16..v23   the 16 elements of the current group, then reduction temps
 *   v4..v7     compare masks, then the indices travelling with the values
 *   r0 / r1    loop counter / byte offset into x
 *
 * Tie handling: the compares are written as "second >= first", so the mask
 * selects the first operand (always the lower index) when the candidates are
 * equal.  Together with the strict compare against the running minimum this
 * yields the first occurrence of the minimum, which is what the scalar loops
 * in CNAME (strict '<') produce.  With a strict compare in the reduction,
 * equal values inside one group resolved to the later index instead.
 *
 * NOTE(review): this block needs an s390x target with the vector facility;
 * the tie-break change could not be executed here and should be confirmed
 * with a vector containing repeated minima inside one 16-element group.
 */
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
{
    BLASLONG imin;

    __asm__ volatile (
        /* Seed: running minimum = first two elements, indices {0,1}. */
        "vl %%v0,0(%3) \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,1,1 \n\t"
        /* v2 = {16,16} group step, v3 = {0,0} group base. */
        "vrepig %%v2,16 \n\t"
        "vzero %%v3 \n\t"
        /* v24..v31 = index offsets {0,1},{2,3},...,{14,15}. */
        "vleig %%v24,0,0 \n\t"
        "vleig %%v24,1,1 \n\t"
        "vleig %%v25,2,0 \n\t"
        "vleig %%v25,3,1 \n\t"
        "vleig %%v26,4,0 \n\t"
        "vleig %%v26,5,1 \n\t"
        "vleig %%v27,6,0 \n\t"
        "vleig %%v27,7,1 \n\t"
        "vleig %%v28,8,0 \n\t"
        "vleig %%v28,9,1 \n\t"
        "vleig %%v29,10,0 \n\t"
        "vleig %%v29,11,1 \n\t"
        "vleig %%v30,12,0 \n\t"
        "vleig %%v30,13,1 \n\t"
        "vleig %%v31,14,0 \n\t"
        "vleig %%v31,15,1 \n\t"
        /* r0 = n / 32 iterations, r1 = byte offset 0. */
        "srlg %%r0,%2,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* ---- first group of 16 elements (bytes 0..127) ---- */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"

        /* Level 1: 8 vectors -> 4, values in v16..v19, indices in v4..v7.
           Mask = (second >= first), so a tie keeps the lower index. */
        "vfchedb %%v4,%%v17,%%v16 \n\t"
        "vfchedb %%v5,%%v19,%%v18 \n\t"
        "vfchedb %%v6,%%v21,%%v20 \n\t"
        "vfchedb %%v7,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v4 \n\t"
        "vsel %%v4,%%v24,%%v25,%%v4 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v5 \n\t"
        "vsel %%v5,%%v26,%%v27,%%v5 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v6 \n\t"
        "vsel %%v6,%%v28,%%v29,%%v6 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v7 \n\t"
        "vsel %%v7,%%v30,%%v31,%%v7 \n\t"

        /* Level 2: 4 -> 2. */
        "vfchedb %%v20,%%v17,%%v16 \n\t"
        "vfchedb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v5,%%v6,%%v7,%%v21 \n\t"

        /* Level 3: 2 -> 1, then turn the offset into an absolute index. */
        "vfchedb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Merge into the running minimum; strict compare keeps the earlier
           group on a tie.  Then advance the group base by 16. */
        "vfchdb %%v5,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v5 \n\t"
        "vsel %%v1,%%v4,%%v1,%%v5 \n\t"
        "vag %%v3,%%v3,%%v2 \n\t"

        /* ---- second group of 16 elements (bytes 128..255) ---- */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"

        "vfchedb %%v4,%%v17,%%v16 \n\t"
        "vfchedb %%v5,%%v19,%%v18 \n\t"
        "vfchedb %%v6,%%v21,%%v20 \n\t"
        "vfchedb %%v7,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v4 \n\t"
        "vsel %%v4,%%v24,%%v25,%%v4 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v5 \n\t"
        "vsel %%v5,%%v26,%%v27,%%v5 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v6 \n\t"
        "vsel %%v6,%%v28,%%v29,%%v6 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v7 \n\t"
        "vsel %%v7,%%v30,%%v31,%%v7 \n\t"

        "vfchedb %%v20,%%v17,%%v16 \n\t"
        "vfchedb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v5,%%v6,%%v7,%%v21 \n\t"

        "vfchedb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "vfchdb %%v5,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v5 \n\t"
        "vsel %%v1,%%v4,%%v1,%%v5 \n\t"
        "vag %%v3,%%v3,%%v2 \n\t"

        /* Next 256 bytes; loop while the count in r0 is not exhausted. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Combine the two lanes: v2/v3 = lane 1 value/index replicated. */
        "vrepg %%v2,%%v0,1 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcdb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Lanes hold equal values: store it and take the smaller index. */
        "vsteg %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Lanes differ: take lane 1 when lane 0 is greater, else lane 0. */
        "wfchdb %%v4,%%v0,%%v2 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "std %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(imin),"=m"(*min)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imin;
}
|
||||
|
||||
/*
 * Return the 1-based position of the smallest element (plain signed
 * comparison, no absolute value) among n elements of x read with stride
 * inc_x.  When several elements share the minimum, the scalar loops below
 * keep the first one because they compare with a strict '<'.
 *
 * n      number of elements; n <= 0 yields 0.
 * x      input vector.
 * inc_x  stride between consecutive elements; inc_x <= 0 yields 0.
 *
 * For unit stride the leading multiple-of-32 part is handed to
 * idmin_kernel_32 and the remainder is finished by the scalar loop.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT minf;
    BLASLONG min = 0;

    if (n <= 0 || inc_x <= 0) return (min);

    /*
     * Seed the running minimum with the first element.  Starting from 0.0
     * made every vector whose elements are all >= 0 report position 1
     * whenever the kernel did not run (n < 32 or inc_x != 1).  Index 0 is
     * compared against itself once; that is redundant but harmless.
     */
    minf = x[0];

    if (inc_x == 1) {

        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            /* The kernel overwrites minf and returns a 0-based index. */
            min = idmin_kernel_32(n1, x, &minf);
            i = n1;
        }

        while (i < n) {
            if (x[i] < minf) {
                min = i;
                minf = x[i];
            }
            i++;
        }
        return (min + 1);
    }

    /* Strided access: i walks the storage offset, j the logical index. */
    while (j < n) {
        if (x[i] < minf) {
            min = j;
            minf = x[i];
        }
        i += inc_x;
        j++;
    }
    return (min + 1);
}
|
||||
|
|
@ -0,0 +1,299 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * isamax_kernel_64 - vector kernel behind the unit-stride ISAMAX search.
 *
 * Scans a contiguous single-precision vector with z/Architecture vector
 * instructions and reports the element with the largest absolute value.
 *
 *   n     element count.  The loop counter is n >> 6 and the loop body runs
 *         before the counter is tested, so n must be at least 64; only the
 *         first (n >> 6) * 64 elements are examined.  The caller passes
 *         n & -64 and only calls when that is positive.
 *   x     unit-stride input vector.
 *   amax  receives the absolute value of the selected element.
 *
 * Returns the index of the selected element counted from 0 (the caller
 * adds 1 to obtain the BLAS result).
 *
 * Operand-list notes:
 *   - %0 is early-clobber ("=&r"): on the path through label 1 it is written
 *     by vlgvg before "ste %f0,%1" uses the address of *amax, so it must not
 *     share a register with that address.
 *   - *amax uses "Q" (base register + short displacement), the addressing
 *     form that both vstef and ste can encode.
 *   - v3 is listed as clobbered: it is written by vrepig and veslg.
 */
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
{
    BLASLONG iamax;

    __asm__ volatile (
        /* v0: running best values, seeded with |x[0..3]|. */
        "vl %%v0,0(%3) \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        /* v1 = {0,2}, v2 = {1,3}: 64-bit indices paired with the lanes of v0. */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = 32 per doubleword (index step per half), v4 = running base index. */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31: word lanes hold the in-block positions 0..31. */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = iteration count (n / 64), r1 = byte offset into x. */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* First half of the iteration: 32 elements at byte offsets 0..127. */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        /* Pairwise reduction 8 -> 4 vectors; positions follow the values. */
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        /* 4 -> 2 vectors. */
        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        /* 2 -> 1 vector, then widen the positions to 64 bits and add the base. */
        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* Merge into the running best (v0) and its indices (v1, v2). */
        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Second half of the iteration: 32 elements at byte offsets 128..255. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Advance 256 bytes (64 floats) and loop while r0 is non-zero. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Horizontal reduction of the four lanes of v0 and their indices. */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Both candidates compare equal: store the value, take the lower index. */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        : "=&r"(iamax), "=Q"(*amax)
        : "r"(n), "ZR"((const FLOAT (*)[n])x)
        : "memory", "cc", "r0", "r1",
          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );

    return iamax;
}
|
||||
|
||||
/*
 * ISAMAX driver: 1-based position of the element of x with the largest
 * absolute value, or 0 when n <= 0 or inc_x <= 0.
 *
 *   n      number of logical elements
 *   x      input vector
 *   inc_x  distance, in elements, between consecutive logical elements
 *
 * For inc_x == 1 the leading n & -64 elements go through the vector kernel
 * and the remainder is scanned here; any other stride is scanned entirely
 * by the scalar loop.  A later element replaces the current best only when
 * it is strictly greater.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best_abs = 0.0;
    BLASLONG best_pos = 0;
    BLASLONG offset = 0;
    BLASLONG k;

    if (n <= 0 || inc_x <= 0)
        return best_pos;

    if (inc_x != 1) {
        /* Strided input: walk the logical elements in order. */
        for (k = 0; k < n; k++) {
            FLOAT cur = ABS(x[offset]);
            if (cur > best_abs) {
                best_pos = k;
                best_abs = cur;
            }
            offset += inc_x;
        }
        return best_pos + 1;
    }

    /* Contiguous input: vector kernel over the largest multiple of 64. */
    BLASLONG vec_len = n & -64;
    if (vec_len > 0) {
        best_pos = isamax_kernel_64(vec_len, x, &best_abs);
        offset = vec_len;
    }

    /* Scalar tail (or the whole vector when n < 64). */
    for (k = offset; k < n; k++) {
        FLOAT cur = ABS(x[k]);
        if (cur > best_abs) {
            best_pos = k;
            best_abs = cur;
        }
    }

    return best_pos + 1;
}
|
||||
|
|
@ -0,0 +1,299 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/* ABS: absolute-value function used by the scalar loops below; fabs when the build defines DOUBLE, fabsf otherwise. */
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * isamin_kernel_64 - vector kernel behind the unit-stride ISAMIN search.
 *
 * Scans a contiguous single-precision vector with z/Architecture vector
 * instructions and reports the element with the smallest absolute value.
 *
 *   n     element count.  The loop counter is n >> 6 and the loop body runs
 *         before the counter is tested, so n must be at least 64; only the
 *         first (n >> 6) * 64 elements are examined.  The caller passes
 *         n & -64 and only calls when that is positive.
 *   x     unit-stride input vector.
 *   amin  receives the absolute value of the selected element.
 *
 * Returns the index of the selected element counted from 0 (the caller
 * adds 1 to obtain the BLAS result).
 *
 * Operand-list notes:
 *   - %0 is early-clobber ("=&r"): on the path through label 1 it is written
 *     by vlgvg before "ste %f0,%1" uses the address of *amin, so it must not
 *     share a register with that address.
 *   - *amin uses "Q" (base register + short displacement), the addressing
 *     form that both vstef and ste can encode.
 *   - v3 is listed as clobbered: it is written by vrepig and veslg.
 */
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
{
    BLASLONG iamin;

    __asm__ volatile (
        /* v0: running best values, seeded with |x[0..3]|. */
        "vl %%v0,0(%3) \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        /* v1 = {0,2}, v2 = {1,3}: 64-bit indices paired with the lanes of v0. */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = 32 per doubleword (index step per half), v4 = running base index. */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31: word lanes hold the in-block positions 0..31. */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = iteration count (n / 64), r1 = byte offset into x. */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* First half of the iteration: 32 elements at byte offsets 0..127. */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        /* Pairwise reduction 8 -> 4 vectors (compare operands are swapped
           relative to the max kernel); positions follow the values. */
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        /* 4 -> 2 vectors. */
        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        /* 2 -> 1 vector, then widen the positions to 64 bits and add the base. */
        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* Merge into the running best (v0) and its indices (v1, v2). */
        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Second half of the iteration: 32 elements at byte offsets 128..255. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Advance 256 bytes (64 floats) and loop while r0 is non-zero. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Horizontal reduction of the four lanes of v0 and their indices. */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Both candidates compare equal: store the value, take the lower index. */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        : "=&r"(iamin), "=Q"(*amin)
        : "r"(n), "ZR"((const FLOAT (*)[n])x)
        : "memory", "cc", "r0", "r1",
          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );

    return iamin;
}
|
||||
|
||||
/*
 * ISAMIN driver: 1-based position of the element of x with the smallest
 * absolute value, or 0 when n <= 0 or inc_x <= 0.
 *
 *   n      number of logical elements
 *   x      input vector
 *   inc_x  distance, in elements, between consecutive logical elements
 *
 * For inc_x == 1 the leading n & -64 elements go through the vector kernel
 * and the remainder is scanned here; any other stride is scanned entirely
 * by the scalar loop.  A later element replaces the current best only when
 * it is strictly smaller.
 *
 * The running minimum is seeded from the data (the kernel result or
 * |x[0]|).  Seeding it with 0.0 would make "ABS(x[i]) < minf" impossible
 * to satisfy, so every scalar-only scan would report element 1.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i;
    BLASLONG j;
    FLOAT minf;
    BLASLONG min = 0;

    if (n <= 0 || inc_x <= 0) return (min);

    if (inc_x == 1) {

        BLASLONG n1 = n & -64;
        if (n1 > 0) {
            /* Kernel returns the index and writes the minimum so far. */
            min = isamin_kernel_64(n1, x, &minf);
            i = n1;
        } else {
            /* Fewer than 64 elements: start from the first one. */
            minf = ABS(x[0]);
            i = 1;
        }

        for (; i < n; i++) {
            if (ABS(x[i]) < minf) {
                min = i;
                minf = ABS(x[i]);
            }
        }
        return (min + 1);
    }

    /* Strided input: element 0 is the initial candidate. */
    minf = ABS(x[0]);
    i = inc_x;
    for (j = 1; j < n; j++) {
        if (ABS(x[i]) < minf) {
            min = j;
            minf = ABS(x[i]);
        }
        i += inc_x;
    }
    return (min + 1);
}
|
||||
|
|
@ -0,0 +1,275 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * ismax_kernel_64 - vector kernel behind the unit-stride ISMAX search.
 *
 * Scans a contiguous single-precision vector with z/Architecture vector
 * instructions and reports the element with the largest (signed) value.
 *
 *   n     element count.  The loop counter is n >> 6 and the loop body runs
 *         before the counter is tested, so n must be at least 64; only the
 *         first (n >> 6) * 64 elements are examined.  The caller passes
 *         n & -64 and only calls when that is positive.
 *   x     unit-stride input vector.
 *   max   receives the value of the selected element.
 *
 * Returns the index of the selected element counted from 0 (the caller
 * adds 1 to obtain the BLAS result).
 *
 * Operand-list notes:
 *   - %0 is early-clobber ("=&r"): on the path through label 1 it is written
 *     by vlgvg before "ste %f0,%1" uses the address of *max, so it must not
 *     share a register with that address.
 *   - *max uses "Q" (base register + short displacement), the addressing
 *     form that both vstef and ste can encode.
 *   - v3 is listed as clobbered: it is written by vrepig and veslg.
 */
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
{
    BLASLONG imax;

    __asm__ volatile (
        /* v0: running best values, seeded with x[0..3]. */
        "vl %%v0,0(%3) \n\t"
        /* v1 = {0,2}, v2 = {1,3}: 64-bit indices paired with the lanes of v0. */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = 32 per doubleword (index step per half), v4 = running base index. */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31: word lanes hold the in-block positions 0..31. */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = iteration count (n / 64), r1 = byte offset into x. */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* First half of the iteration: 32 elements at byte offsets 0..127. */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"

        /* Pairwise reduction 8 -> 4 vectors; positions follow the values. */
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        /* 4 -> 2 vectors. */
        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        /* 2 -> 1 vector, then widen the positions to 64 bits and add the base. */
        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* Merge into the running best (v0) and its indices (v1, v2). */
        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Second half of the iteration: 32 elements at byte offsets 128..255. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"

        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Advance 256 bytes (64 floats) and loop while r0 is non-zero. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Horizontal reduction of the four lanes of v0 and their indices. */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Both candidates compare equal: store the value, take the lower index. */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        : "=&r"(imax), "=Q"(*max)
        : "r"(n), "ZR"((const FLOAT (*)[n])x)
        : "memory", "cc", "r0", "r1",
          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );

    return imax;
}
|
||||
|
||||
/*
 * ISMAX driver: 1-based position of the element of x with the largest
 * (signed) value, or 0 when n <= 0 or inc_x <= 0.
 *
 *   n      number of logical elements
 *   x      input vector
 *   inc_x  distance, in elements, between consecutive logical elements
 *
 * For inc_x == 1 the leading n & -64 elements go through the vector kernel
 * and the remainder is scanned here; any other stride is scanned entirely
 * by the scalar loop.  A later element replaces the current best only when
 * it is strictly greater.
 *
 * The running maximum is seeded from the data (the kernel result or x[0]).
 * Seeding it with 0.0 would hide every non-positive element, so a vector
 * of negative values scanned only by the scalar loop would always report
 * element 1.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i;
    BLASLONG j;
    FLOAT maxf;
    BLASLONG max = 0;

    if (n <= 0 || inc_x <= 0) return (max);

    if (inc_x == 1) {

        BLASLONG n1 = n & -64;
        if (n1 > 0) {
            /* Kernel returns the index and writes the maximum so far. */
            max = ismax_kernel_64(n1, x, &maxf);
            i = n1;
        } else {
            /* Fewer than 64 elements: start from the first one. */
            maxf = x[0];
            i = 1;
        }

        for (; i < n; i++) {
            if (x[i] > maxf) {
                max = i;
                maxf = x[i];
            }
        }
        return (max + 1);
    }

    /* Strided input: element 0 is the initial candidate. */
    maxf = x[0];
    i = inc_x;
    for (j = 1; j < n; j++) {
        if (x[i] > maxf) {
            max = j;
            maxf = x[i];
        }
        i += inc_x;
    }
    return (max + 1);
}
|
||||
|
|
@ -0,0 +1,275 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector kernel: find the minimum of a contiguous FLOAT array and its index.
 *
 *   n    number of elements to scan.  The loop count is n >> 6 and the loop
 *        is a count-down (brctg), so n must be a positive multiple of 64;
 *        the caller in this file passes (n & -64) only when it is > 0.
 *   x    pointer to the first element (unit stride).
 *   min  (out) receives the minimum value found.
 *
 * Returns the 0-based index of the selected minimum element.
 *
 * Register roles (as set up by the code below):
 *   v0        running per-lane minima (seeded from x[0..3])
 *   v1, v2    64-bit indices belonging to the even / odd word lanes of v0
 *   v3        constant 32 in each doubleword: index step per 32-element half
 *   v4        index base of the half currently being processed
 *   v24..v31  word constants 0..31: positions inside one 32-element half
 *   r0        remaining iterations, r1 byte offset into x
 *
 * Fix relative to the previous revision: v3 is written by the asm but was
 * missing from the clobber list, which allowed the compiler to keep a live
 * value in v3/f3 across the statement.  It is now declared.
 */
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
{
    BLASLONG imin;

    __asm__ volatile (
        /* Seed running minima with x[0..3] and their indices 0,2 / 1,3. */
        "vl %%v0,0(%3) \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* Position constants 0..31, four words per vector. */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 iterations, r1 = 0 byte offset. */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* First half of the iteration: elements at byte offsets 0..127. */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"

        /* Pairwise reduction 8 -> 4 vectors; each value select is mirrored
           by a select on the matching position constants. */
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        /* 4 -> 2 vectors. */
        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        /* 2 -> 1 vector, then widen the word positions to doublewords
           (odd lanes into v6, even lanes into v5) and add the base v4. */
        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* Merge into the running minima; the word compare mask is widened
           to doublewords before it is applied to the 64-bit indices. */
        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Second half of the iteration: byte offsets 128..255, same steps. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"

        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Advance 256 bytes (64 floats) and loop while r0 != 0. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Fold odd lanes onto even lanes (4 -> 2 candidates).  v3 is reused
           as scratch here.  On equal values the even lane is kept only when
           the odd lane's index is larger (vchlg + vfcesb). */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        /* Final 2 -> 1: compare the two remaining candidates as scalars. */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Equal values: store it and return the smaller of the two indices. */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Different values: keep the smaller one and its index. */
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(imin),"=m"(*min)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imin;
}
|
||||
|
||||
/*
 * Return the 1-based position of the smallest element among the n elements
 * x[0], x[inc_x], x[2*inc_x], ...
 *
 *   n      number of elements to examine
 *   x      pointer to the first element
 *   inc_x  stride between consecutive elements, in elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  The scalar loops compare with a
 * strict '<', so among the elements they visit an earlier element wins over
 * a later equal one.
 *
 * Fix relative to the previous revision: the running minimum used to start
 * at 0.0.  Whenever the vector kernel did not run (unit stride with n < 64,
 * or any non-unit stride), a vector without negative elements never passed
 * the "x[i] < minf" test and the function always returned 1.  The running
 * minimum is now seeded from x[0] on those paths.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT minf = 0.0;
    BLASLONG min = 0;

    if (n <= 0 || inc_x <= 0) return (min);

    if (inc_x == 1) {

        /* The vector kernel handles the largest multiple of 64 elements. */
        BLASLONG n1 = n & -64;
        if (n1 > 0) {
            min = ismin_kernel_64(n1, x, &minf);
            i = n1;
        } else {
            /* No kernel pass: start from the first element. */
            minf = x[0];
            i = 1;
        }

        /* Scalar tail (or the whole vector when n < 64). */
        while (i < n) {
            if (x[i] < minf) {
                min = i;
                minf = x[i];
            }
            i++;
        }
        return (min + 1);

    } else {

        /* Strided access: i indexes into x, j counts logical elements. */
        BLASLONG n1 = n & -4;

        /* Seed with the first element; the first comparison below is then a
           harmless self-comparison. */
        minf = x[0];

        /* Main loop, manually unrolled by four. */
        while (j < n1) {

            if (x[i] < minf) {
                min = j;
                minf = x[i];
            }
            if (x[i + inc_x] < minf) {
                min = j + 1;
                minf = x[i + inc_x];
            }
            if (x[i + 2 * inc_x] < minf) {
                min = j + 2;
                minf = x[i + 2 * inc_x];
            }
            if (x[i + 3 * inc_x] < minf) {
                min = j + 3;
                minf = x[i + 3 * inc_x];
            }

            i += inc_x * 4;
            j += 4;
        }

        /* Remaining 0..3 elements. */
        while (j < n) {
            if (x[i] < minf) {
                min = j;
                minf = x[i];
            }
            i += inc_x;
            j++;
        }
        return (min + 1);
    }
}
|
||||
|
|
@ -24,190 +24,165 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define ABS fabs
|
||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||
{
|
||||
BLASLONG iamax;
|
||||
|
||||
__asm__ volatile (
|
||||
"vleg %%v0,0(%3),0 \n\t"
|
||||
"vleg %%v1,8(%3),0 \n\t"
|
||||
"vleg %%v0,16(%3),1 \n\t"
|
||||
"vleg %%v1,24(%3),1 \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vflpdb %%v1,%%v1 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v1 \n\t"
|
||||
"vleig %%v1,0,0 \n\t"
|
||||
"vleig %%v1,1,1 \n\t"
|
||||
"vrepig %%v2,8 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"vleig %%v24,0,0 \n\t"
|
||||
"vleig %%v24,1,1 \n\t"
|
||||
"vleig %%v25,2,0 \n\t"
|
||||
"vleig %%v25,3,1 \n\t"
|
||||
"vleig %%v26,4,0 \n\t"
|
||||
"vleig %%v26,5,1 \n\t"
|
||||
"vleig %%v27,6,0 \n\t"
|
||||
"vleig %%v27,7,1 \n\t"
|
||||
"srlg %%r0,%2,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
||||
|
||||
|
||||
/**
|
||||
* Find maximum index
|
||||
* Warning: requirements n>0 and n % 16 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param maxf (out) maximum absolute value .( only for output )
|
||||
* @return index
|
||||
*/
|
||||
static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||
BLASLONG index;
|
||||
__asm__(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"vleig %%v16,0,0 \n\t"
|
||||
"vleig %%v16,1,1 \n\t"
|
||||
"vleig %%v17,2,0 \n\t"
|
||||
"vleig %%v17,3,1 \n\t"
|
||||
"vleig %%v18,4,0 \n\t"
|
||||
"vleig %%v18,5,1 \n\t"
|
||||
"vleig %%v19,6,0 \n\t"
|
||||
"vleig %%v19,7,1 \n\t"
|
||||
"vleig %%v20,8,0 \n\t"
|
||||
"vleig %%v20,9,1 \n\t"
|
||||
"vleig %%v21,10,0 \n\t"
|
||||
"vleig %%v21,11,1 \n\t"
|
||||
"vleig %%v22,12,0 \n\t"
|
||||
"vleig %%v22,13,1 \n\t"
|
||||
"vleig %%v23,14,0 \n\t"
|
||||
"vleig %%v23,15,1 \n\t"
|
||||
|
||||
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vzero %%v6 \n\t"
|
||||
"vzero %%v7 \n\t"
|
||||
"vrepig %%v4,16 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
"vleg %%v16,0(%%r1,%3),0 \n\t"
|
||||
"vleg %%v17,8(%%r1,%3),0 \n\t"
|
||||
"vleg %%v16,16(%%r1,%3),1 \n\t"
|
||||
"vleg %%v17,24(%%r1,%3),1 \n\t"
|
||||
"vleg %%v18,32(%%r1,%3),0 \n\t"
|
||||
"vleg %%v19,40(%%r1,%3),0 \n\t"
|
||||
"vleg %%v18,48(%%r1,%3),1 \n\t"
|
||||
"vleg %%v19,56(%%r1,%3),1 \n\t"
|
||||
"vleg %%v20,64(%%r1,%3),0 \n\t"
|
||||
"vleg %%v21,72(%%r1,%3),0 \n\t"
|
||||
"vleg %%v20,80(%%r1,%3),1 \n\t"
|
||||
"vleg %%v21,88(%%r1,%3),1 \n\t"
|
||||
"vleg %%v22,96(%%r1,%3),0 \n\t"
|
||||
"vleg %%v23,104(%%r1,%3),0 \n\t"
|
||||
"vleg %%v22,112(%%r1,%3),1 \n\t"
|
||||
"vleg %%v23,120(%%r1,%3),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v1,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v2,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v3,%%v30,%%v31 \n\t"
|
||||
|
||||
|
||||
"vleg %%v24 , 128(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 , 136(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 , 144(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 , 152(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 , 160(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 , 168(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 , 176(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 , 184(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 , 192(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 , 200(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 , 208(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 , 216(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 , 224(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 , 232(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 , 240(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 , 248(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v24,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v26,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v28,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v30,%%v30,%%v31 \n\t"
|
||||
|
||||
"vfchdb %%v25,%%v1,%%v0 \n\t"
|
||||
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
|
||||
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v3,%%v2 \n\t "
|
||||
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
|
||||
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v25,%%v26,%%v24 \n\t"
|
||||
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
|
||||
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v30,%%v28 \n\t"
|
||||
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
|
||||
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v24, %%v1,%%v31 \n\t"
|
||||
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
|
||||
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
|
||||
|
||||
"vfchdb %%v30, %%v27,%%v3 \n\t"
|
||||
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
|
||||
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
|
||||
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vfchdb %%v0, %%v31,%%v28 \n\t"
|
||||
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
|
||||
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
|
||||
|
||||
"vag %%v25,%%v25,%%v5 \n\t"
|
||||
|
||||
//cmp with previous
|
||||
"vfchdb %%v30, %%v27,%%v6 \n\t"
|
||||
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
|
||||
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
|
||||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
"vfchdb %%v4,%%v16,%%v17 \n\t"
|
||||
"vfchdb %%v5,%%v18,%%v19 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
|
||||
//xtract index
|
||||
"vrepg %%v26,%%v6,1 \n\t"
|
||||
"vrepg %%v5,%%v7,1 \n\t"
|
||||
"wfcdb %%v26,%%v6 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v6,%[maxf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v7 \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"j 3 \n\t"
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v26,%%v6 \n\t"
|
||||
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"std %%f0,%[maxf] \n\t"
|
||||
"3: \n\t"
|
||||
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
"vfchdb %%v18,%%v16,%%v17 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
);
|
||||
return index;
|
||||
"vfchdb %%v5,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
|
||||
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"vleg %%v16,128(%%r1,%3),0 \n\t"
|
||||
"vleg %%v17,136(%%r1,%3),0 \n\t"
|
||||
"vleg %%v16,144(%%r1,%3),1 \n\t"
|
||||
"vleg %%v17,152(%%r1,%3),1 \n\t"
|
||||
"vleg %%v18,160(%%r1,%3),0 \n\t"
|
||||
"vleg %%v19,168(%%r1,%3),0 \n\t"
|
||||
"vleg %%v18,176(%%r1,%3),1 \n\t"
|
||||
"vleg %%v19,184(%%r1,%3),1 \n\t"
|
||||
"vleg %%v20,192(%%r1,%3),0 \n\t"
|
||||
"vleg %%v21,200(%%r1,%3),0 \n\t"
|
||||
"vleg %%v20,208(%%r1,%3),1 \n\t"
|
||||
"vleg %%v21,216(%%r1,%3),1 \n\t"
|
||||
"vleg %%v22,224(%%r1,%3),0 \n\t"
|
||||
"vleg %%v23,232(%%r1,%3),0 \n\t"
|
||||
"vleg %%v22,240(%%r1,%3),1 \n\t"
|
||||
"vleg %%v23,248(%%r1,%3),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vfchdb %%v4,%%v16,%%v17 \n\t"
|
||||
"vfchdb %%v5,%%v18,%%v19 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
|
||||
"vfchdb %%v18,%%v16,%%v17 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchdb %%v5,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
|
||||
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"vrepg %%v3,%%v1,1 \n\t"
|
||||
"wfcdb %%v2,%%v0 \n\t"
|
||||
"jne 1f \n\t"
|
||||
"vsteg %%v0,%1,0 \n\t"
|
||||
"vmnlg %%v0,%%v1,%%v3 \n\t"
|
||||
"vlgvg %0,%%v0,0 \n\t"
|
||||
"j 2f \n\t"
|
||||
"1: \n\t"
|
||||
"wfchdb %%v4,%%v2,%%v0 \n\t"
|
||||
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
|
||||
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"2: \n\t"
|
||||
"nop "
|
||||
:"=r"(iamax),"=m"(*amax)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
||||
);
|
||||
|
||||
return iamax;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
|
|
@ -223,9 +198,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
max = ziamax_kernel_16_TUNED(n1, x, &maxf);
|
||||
max = izamax_kernel_16(n1, x, &maxf);
|
||||
|
||||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
|
|
@ -260,7 +235,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
}
|
||||
return (max + 1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -24,253 +24,217 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define ABS fabs
|
||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
|
||||
/**
|
||||
* Find minimum index
|
||||
* Warning: requirements n>0 and n % 16 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param minf (out) minimum absolute value .( only for output )
|
||||
* @return minimum index
|
||||
*/
|
||||
static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||
BLASLONG index ;
|
||||
__asm__(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"vleig %%v16,0,0 \n\t"
|
||||
"vleig %%v16,1,1 \n\t"
|
||||
"vleig %%v17,2,0 \n\t"
|
||||
"vleig %%v17,3,1 \n\t"
|
||||
"vleig %%v18,4,0 \n\t"
|
||||
"vleig %%v18,5,1 \n\t"
|
||||
"vleig %%v19,6,0 \n\t"
|
||||
"vleig %%v19,7,1 \n\t"
|
||||
"vleig %%v20,8,0 \n\t"
|
||||
"vleig %%v20,9,1 \n\t"
|
||||
"vleig %%v21,10,0 \n\t"
|
||||
"vleig %%v21,11,1 \n\t"
|
||||
"vleig %%v22,12,0 \n\t"
|
||||
"vleig %%v22,13,1 \n\t"
|
||||
"vleig %%v23,14,0 \n\t"
|
||||
"vleig %%v23,15,1 \n\t"
|
||||
"ld %%f6,0(%[ptr_x]) \n\t"
|
||||
"lpdbr %%f6,%%f6 \n\t"
|
||||
"ld %%f7,8(%[ptr_x]) \n\t"
|
||||
"lpdbr %%f7,%%f7 \n\t"
|
||||
"adbr %%f6,%%f7 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vrepg %%v6,%%v6,0 \n\t"
|
||||
"vzero %%v7 \n\t"
|
||||
"vrepig %%v4,16 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||
{
|
||||
BLASLONG iamin;
|
||||
|
||||
__asm__ volatile (
|
||||
"vleg %%v0,0(%3),0 \n\t"
|
||||
"vleg %%v1,8(%3),0 \n\t"
|
||||
"vleg %%v0,16(%3),1 \n\t"
|
||||
"vleg %%v1,24(%3),1 \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vflpdb %%v1,%%v1 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v1 \n\t"
|
||||
"vleig %%v1,0,0 \n\t"
|
||||
"vleig %%v1,1,1 \n\t"
|
||||
"vrepig %%v2,8 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"vleig %%v24,0,0 \n\t"
|
||||
"vleig %%v24,1,1 \n\t"
|
||||
"vleig %%v25,2,0 \n\t"
|
||||
"vleig %%v25,3,1 \n\t"
|
||||
"vleig %%v26,4,0 \n\t"
|
||||
"vleig %%v26,5,1 \n\t"
|
||||
"vleig %%v27,6,0 \n\t"
|
||||
"vleig %%v27,7,1 \n\t"
|
||||
"srlg %%r0,%2,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
||||
|
||||
"vleg %%v16,0(%%r1,%3),0 \n\t"
|
||||
"vleg %%v17,8(%%r1,%3),0 \n\t"
|
||||
"vleg %%v16,16(%%r1,%3),1 \n\t"
|
||||
"vleg %%v17,24(%%r1,%3),1 \n\t"
|
||||
"vleg %%v18,32(%%r1,%3),0 \n\t"
|
||||
"vleg %%v19,40(%%r1,%3),0 \n\t"
|
||||
"vleg %%v18,48(%%r1,%3),1 \n\t"
|
||||
"vleg %%v19,56(%%r1,%3),1 \n\t"
|
||||
"vleg %%v20,64(%%r1,%3),0 \n\t"
|
||||
"vleg %%v21,72(%%r1,%3),0 \n\t"
|
||||
"vleg %%v20,80(%%r1,%3),1 \n\t"
|
||||
"vleg %%v21,88(%%r1,%3),1 \n\t"
|
||||
"vleg %%v22,96(%%r1,%3),0 \n\t"
|
||||
"vleg %%v23,104(%%r1,%3),0 \n\t"
|
||||
"vleg %%v22,112(%%r1,%3),1 \n\t"
|
||||
"vleg %%v23,120(%%r1,%3),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v1,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v2,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v3,%%v30,%%v31 \n\t"
|
||||
|
||||
|
||||
"vleg %%v24 ,128(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 ,136(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 ,144(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 ,152(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 ,160(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 ,168(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 ,176(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 ,184(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 ,192(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 ,200(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 ,208(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 ,216(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 ,224(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 ,232(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 ,240(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 ,248(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v24,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v26,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v28,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v30,%%v30,%%v31 \n\t"
|
||||
|
||||
|
||||
"vfchdb %%v25,%%v0 ,%%v1 \n\t"
|
||||
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
|
||||
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v2,%%v3 \n\t"
|
||||
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
|
||||
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v25,%%v24,%%v26 \n\t"
|
||||
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
|
||||
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v28,%%v30 \n\t"
|
||||
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
|
||||
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v24,%%v31, %%v1 \n\t"
|
||||
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
|
||||
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
|
||||
|
||||
"vfchdb %%v30,%%v3, %%v27 \n\t"
|
||||
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
|
||||
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
|
||||
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vfchdb %%v0,%%v28, %%v31 \n\t"
|
||||
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
|
||||
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
|
||||
|
||||
"vag %%v25,%%v25,%%v5 \n\t"
|
||||
|
||||
//cmp with previous
|
||||
"vfchdb %%v30,%%v6 , %%v27 \n\t"
|
||||
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
|
||||
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
|
||||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
"vfchdb %%v4,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v5,%%v19,%%v18 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
|
||||
//xtract index
|
||||
"vrepg %%v26,%%v6,1 \n\t"
|
||||
"vrepg %%v5,%%v7,1 \n\t"
|
||||
"wfcdb %%v26,%%v6 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v6,%[minf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v7 \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"j 3f \n\t"
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v6 ,%%v26 \n\t"
|
||||
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"std %%f0,%[minf] \n\t"
|
||||
"3: \n\t"
|
||||
"vfchdb %%v18,%%v17,%%v16 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
"vfchdb %%v5,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
|
||||
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
);
|
||||
"vleg %%v16,128(%%r1,%3),0 \n\t"
|
||||
"vleg %%v17,136(%%r1,%3),0 \n\t"
|
||||
"vleg %%v16,144(%%r1,%3),1 \n\t"
|
||||
"vleg %%v17,152(%%r1,%3),1 \n\t"
|
||||
"vleg %%v18,160(%%r1,%3),0 \n\t"
|
||||
"vleg %%v19,168(%%r1,%3),0 \n\t"
|
||||
"vleg %%v18,176(%%r1,%3),1 \n\t"
|
||||
"vleg %%v19,184(%%r1,%3),1 \n\t"
|
||||
"vleg %%v20,192(%%r1,%3),0 \n\t"
|
||||
"vleg %%v21,200(%%r1,%3),0 \n\t"
|
||||
"vleg %%v20,208(%%r1,%3),1 \n\t"
|
||||
"vleg %%v21,216(%%r1,%3),1 \n\t"
|
||||
"vleg %%v22,224(%%r1,%3),0 \n\t"
|
||||
"vleg %%v23,232(%%r1,%3),0 \n\t"
|
||||
"vleg %%v22,240(%%r1,%3),1 \n\t"
|
||||
"vleg %%v23,248(%%r1,%3),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vfchdb %%v4,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v5,%%v19,%%v18 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
|
||||
return index;
|
||||
"vfchdb %%v18,%%v17,%%v16 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchdb %%v5,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
|
||||
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"vrepg %%v3,%%v1,1 \n\t"
|
||||
"wfcdb %%v2,%%v0 \n\t"
|
||||
"jne 1f \n\t"
|
||||
"vsteg %%v0,%1,0 \n\t"
|
||||
"vmnlg %%v0,%%v1,%%v3 \n\t"
|
||||
"vlgvg %0,%%v0,0 \n\t"
|
||||
"j 2f \n\t"
|
||||
"1: \n\t"
|
||||
"wfchdb %%v4,%%v0,%%v2 \n\t"
|
||||
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
|
||||
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"2: \n\t"
|
||||
"nop "
|
||||
:"=r"(iamin),"=m"(*amin)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
||||
);
|
||||
|
||||
return iamin;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0;
|
||||
FLOAT minf;
|
||||
BLASLONG min=0;
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix = 0;
|
||||
FLOAT minf = 0;
|
||||
BLASLONG min = 0;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(min);
|
||||
|
||||
|
||||
|
||||
if (inc_x == 1) {
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
min = izamin_kernel_16(n1, x, &minf);
|
||||
|
||||
min = ziamin_kernel_16_TUNED(n1, x, &minf);
|
||||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
else {
|
||||
//assign minf
|
||||
minf = CABS1(x,0);
|
||||
ix += 2;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
while(i < n)
|
||||
{
|
||||
if( CABS1(x,ix) < minf )
|
||||
{
|
||||
if( CABS1(x,ix) < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = CABS1(x,ix);
|
||||
}
|
||||
ix += 2;
|
||||
i++;
|
||||
min = i;
|
||||
minf = CABS1(x,ix);
|
||||
}
|
||||
ix += 2;
|
||||
i++;
|
||||
}
|
||||
return (min + 1);
|
||||
|
||||
} else {
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
minf = CABS1(x,0);
|
||||
minf = CABS1(x,0);
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( CABS1(x,ix) < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = CABS1(x,ix);
|
||||
}
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( CABS1(x,ix) < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = CABS1(x,ix);
|
||||
}
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return (min + 1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,210 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * Vector kernel: returns the largest |x[i]| over the first n elements of x.
 *
 * The loop runs n >> 6 times and each trip reads 256 bytes, so the caller is
 * expected to pass a positive multiple of 64 (the short-BFP instruction forms
 * are used, i.e. FLOAT is presumably a 4-byte float).
 *
 * Register use: r0 = remaining trip count, r1 = byte offset into x,
 * v0 = running lane-wise maximum, v16-v31 = scratch.
 */
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT amax;

    __asm__ volatile (
        /* seed the running maximum with |x[0..3]| */
        "vl     %%v0,0(%[x])\n\t"
        "vflpsb %%v0,%%v0\n\t"
        "srlg   %%r0,%[n],6\n\t"
        "xgr    %%r1,%%r1\n\t"
        "0:\n\t"
        "pfd    1,1024(%%r1,%[x])\n\t"

        /* first 128 bytes: load, take absolute values */
        "vl     %%v16,0(%%r1,%[x])\n\t"
        "vl     %%v17,16(%%r1,%[x])\n\t"
        "vl     %%v18,32(%%r1,%[x])\n\t"
        "vl     %%v19,48(%%r1,%[x])\n\t"
        "vl     %%v20,64(%%r1,%[x])\n\t"
        "vl     %%v21,80(%%r1,%[x])\n\t"
        "vl     %%v22,96(%%r1,%[x])\n\t"
        "vl     %%v23,112(%%r1,%[x])\n\t"
        "vflpsb %%v16,%%v16\n\t"
        "vflpsb %%v17,%%v17\n\t"
        "vflpsb %%v18,%%v18\n\t"
        "vflpsb %%v19,%%v19\n\t"
        "vflpsb %%v20,%%v20\n\t"
        "vflpsb %%v21,%%v21\n\t"
        "vflpsb %%v22,%%v22\n\t"
        "vflpsb %%v23,%%v23\n\t"

        /* pairwise compare-high + select: 8 -> 4 -> 2 -> 1 vectors */
        "vfchsb %%v24,%%v16,%%v17\n\t"
        "vfchsb %%v25,%%v18,%%v19\n\t"
        "vfchsb %%v26,%%v20,%%v21\n\t"
        "vfchsb %%v27,%%v22,%%v23\n\t"
        "vsel   %%v24,%%v16,%%v17,%%v24\n\t"
        "vsel   %%v25,%%v18,%%v19,%%v25\n\t"
        "vsel   %%v26,%%v20,%%v21,%%v26\n\t"
        "vsel   %%v27,%%v22,%%v23,%%v27\n\t"

        "vfchsb %%v28,%%v24,%%v25\n\t"
        "vfchsb %%v29,%%v26,%%v27\n\t"
        "vsel   %%v28,%%v24,%%v25,%%v28\n\t"
        "vsel   %%v29,%%v26,%%v27,%%v29\n\t"

        "vfchsb %%v30,%%v28,%%v29\n\t"
        "vsel   %%v30,%%v28,%%v29,%%v30\n\t"

        /* merge into the running maximum */
        "vfchsb %%v31,%%v30,%%v0\n\t"
        "vsel   %%v0,%%v30,%%v0,%%v31\n\t"

        /* second 128 bytes: same reduction */
        "vl     %%v16,128(%%r1,%[x])\n\t"
        "vl     %%v17,144(%%r1,%[x])\n\t"
        "vl     %%v18,160(%%r1,%[x])\n\t"
        "vl     %%v19,176(%%r1,%[x])\n\t"
        "vl     %%v20,192(%%r1,%[x])\n\t"
        "vl     %%v21,208(%%r1,%[x])\n\t"
        "vl     %%v22,224(%%r1,%[x])\n\t"
        "vl     %%v23,240(%%r1,%[x])\n\t"
        "vflpsb %%v16,%%v16\n\t"
        "vflpsb %%v17,%%v17\n\t"
        "vflpsb %%v18,%%v18\n\t"
        "vflpsb %%v19,%%v19\n\t"
        "vflpsb %%v20,%%v20\n\t"
        "vflpsb %%v21,%%v21\n\t"
        "vflpsb %%v22,%%v22\n\t"
        "vflpsb %%v23,%%v23\n\t"

        "vfchsb %%v24,%%v16,%%v17\n\t"
        "vfchsb %%v25,%%v18,%%v19\n\t"
        "vfchsb %%v26,%%v20,%%v21\n\t"
        "vfchsb %%v27,%%v22,%%v23\n\t"
        "vsel   %%v24,%%v16,%%v17,%%v24\n\t"
        "vsel   %%v25,%%v18,%%v19,%%v25\n\t"
        "vsel   %%v26,%%v20,%%v21,%%v26\n\t"
        "vsel   %%v27,%%v22,%%v23,%%v27\n\t"

        "vfchsb %%v28,%%v24,%%v25\n\t"
        "vfchsb %%v29,%%v26,%%v27\n\t"
        "vsel   %%v28,%%v24,%%v25,%%v28\n\t"
        "vsel   %%v29,%%v26,%%v27,%%v29\n\t"

        "vfchsb %%v30,%%v28,%%v29\n\t"
        "vsel   %%v30,%%v28,%%v29,%%v30\n\t"

        "vfchsb %%v31,%%v30,%%v0\n\t"
        "vsel   %%v0,%%v30,%%v0,%%v31\n\t"

        "agfi   %%r1,256\n\t"
        "brctg  %%r0,0b\n\t"

        /* fold the four lanes of v0 into word 0: shifting each doubleword
           left by 32 brings words 1 and 3 onto words 0 and 2 */
        "veslg  %%v16,%%v0,32\n\t"
        "vfchsb %%v17,%%v16,%%v0\n\t"
        "vsel   %%v0,%%v16,%%v0,%%v17\n\t"

        /* then compare word 2 against word 0 */
        "vrepf  %%v16,%%v0,2\n\t"
        "wfchsb %%v17,%%v16,%%v0\n\t"
        "vsel   %%v0,%%v16,%%v0,%%v17\n\t"
        "ler    %[amax],%%f0"
        : [amax] "=f"(amax)
        : [n] "r"(n), [x] "ZR"((const FLOAT (*)[n])x)
        : "memory", "cc", "r0", "r1", "v0",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );

    return amax;
}
|
||||
|
||||
/*
 * Returns the maximum absolute value of n elements of x taken with stride
 * inc_x. Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the largest multiple of 64 elements is handed to
 * samax_kernel_64 and the remainder is scanned in scalar code.
 * Other strides: scalar scan, unrolled four elements at a time.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG pos;
    BLASLONG seen;

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        BLASLONG bulk = n & -64;

        if (bulk > 0) {
            best = samax_kernel_64(bulk, x);
            pos = bulk;
        } else {
            /* fewer than 64 elements: seed with the first one */
            best = ABS(x[0]);
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (ABS(x[pos]) > best)
                best = ABS(x[pos]);
        }
        return (best);
    }

    /* strided path: element 0 seeds the maximum */
    best = ABS(x[0]);
    pos = inc_x;
    seen = 1;

    BLASLONG unrolled_end = (n - 1) & -4;

    for (; seen < unrolled_end; seen += 4, pos += inc_x * 4) {
        if (ABS(x[pos]) > best)
            best = ABS(x[pos]);
        if (ABS(x[pos + inc_x]) > best)
            best = ABS(x[pos + inc_x]);
        if (ABS(x[pos + 2 * inc_x]) > best)
            best = ABS(x[pos + 2 * inc_x]);
        if (ABS(x[pos + 3 * inc_x]) > best)
            best = ABS(x[pos + 3 * inc_x]);
    }

    for (; seen < n; seen++, pos += inc_x) {
        if (ABS(x[pos]) > best)
            best = ABS(x[pos]);
    }
    return (best);
}
|
||||
|
|
@ -0,0 +1,210 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * Vector kernel: returns the smallest |x[i]| over the first n elements of x.
 *
 * The loop runs n >> 6 times and each trip reads 256 bytes, so the caller is
 * expected to pass a positive multiple of 64 (the short-BFP instruction forms
 * are used, i.e. FLOAT is presumably a 4-byte float).
 *
 * Register use: r0 = remaining trip count, r1 = byte offset into x,
 * v0 = running lane-wise minimum, v16-v31 = scratch.
 */
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT amin;

    __asm__ volatile (
        /* seed the running minimum with |x[0..3]| */
        "vl     %%v0,0(%[x])\n\t"
        "vflpsb %%v0,%%v0\n\t"
        "srlg   %%r0,%[n],6\n\t"
        "xgr    %%r1,%%r1\n\t"
        "0:\n\t"
        "pfd    1,1024(%%r1,%[x])\n\t"

        /* first 128 bytes: load, take absolute values */
        "vl     %%v16,0(%%r1,%[x])\n\t"
        "vl     %%v17,16(%%r1,%[x])\n\t"
        "vl     %%v18,32(%%r1,%[x])\n\t"
        "vl     %%v19,48(%%r1,%[x])\n\t"
        "vl     %%v20,64(%%r1,%[x])\n\t"
        "vl     %%v21,80(%%r1,%[x])\n\t"
        "vl     %%v22,96(%%r1,%[x])\n\t"
        "vl     %%v23,112(%%r1,%[x])\n\t"
        "vflpsb %%v16,%%v16\n\t"
        "vflpsb %%v17,%%v17\n\t"
        "vflpsb %%v18,%%v18\n\t"
        "vflpsb %%v19,%%v19\n\t"
        "vflpsb %%v20,%%v20\n\t"
        "vflpsb %%v21,%%v21\n\t"
        "vflpsb %%v22,%%v22\n\t"
        "vflpsb %%v23,%%v23\n\t"

        /* pairwise reduction; the compare operands are swapped relative to
           a maximum search so that the select keeps the smaller value */
        "vfchsb %%v24,%%v17,%%v16\n\t"
        "vfchsb %%v25,%%v19,%%v18\n\t"
        "vfchsb %%v26,%%v21,%%v20\n\t"
        "vfchsb %%v27,%%v23,%%v22\n\t"
        "vsel   %%v24,%%v16,%%v17,%%v24\n\t"
        "vsel   %%v25,%%v18,%%v19,%%v25\n\t"
        "vsel   %%v26,%%v20,%%v21,%%v26\n\t"
        "vsel   %%v27,%%v22,%%v23,%%v27\n\t"

        "vfchsb %%v28,%%v25,%%v24\n\t"
        "vfchsb %%v29,%%v27,%%v26\n\t"
        "vsel   %%v28,%%v24,%%v25,%%v28\n\t"
        "vsel   %%v29,%%v26,%%v27,%%v29\n\t"

        "vfchsb %%v30,%%v29,%%v28\n\t"
        "vsel   %%v30,%%v28,%%v29,%%v30\n\t"

        /* merge into the running minimum */
        "vfchsb %%v31,%%v0,%%v30\n\t"
        "vsel   %%v0,%%v30,%%v0,%%v31\n\t"

        /* second 128 bytes: same reduction */
        "vl     %%v16,128(%%r1,%[x])\n\t"
        "vl     %%v17,144(%%r1,%[x])\n\t"
        "vl     %%v18,160(%%r1,%[x])\n\t"
        "vl     %%v19,176(%%r1,%[x])\n\t"
        "vl     %%v20,192(%%r1,%[x])\n\t"
        "vl     %%v21,208(%%r1,%[x])\n\t"
        "vl     %%v22,224(%%r1,%[x])\n\t"
        "vl     %%v23,240(%%r1,%[x])\n\t"
        "vflpsb %%v16,%%v16\n\t"
        "vflpsb %%v17,%%v17\n\t"
        "vflpsb %%v18,%%v18\n\t"
        "vflpsb %%v19,%%v19\n\t"
        "vflpsb %%v20,%%v20\n\t"
        "vflpsb %%v21,%%v21\n\t"
        "vflpsb %%v22,%%v22\n\t"
        "vflpsb %%v23,%%v23\n\t"

        "vfchsb %%v24,%%v17,%%v16\n\t"
        "vfchsb %%v25,%%v19,%%v18\n\t"
        "vfchsb %%v26,%%v21,%%v20\n\t"
        "vfchsb %%v27,%%v23,%%v22\n\t"
        "vsel   %%v24,%%v16,%%v17,%%v24\n\t"
        "vsel   %%v25,%%v18,%%v19,%%v25\n\t"
        "vsel   %%v26,%%v20,%%v21,%%v26\n\t"
        "vsel   %%v27,%%v22,%%v23,%%v27\n\t"

        "vfchsb %%v28,%%v25,%%v24\n\t"
        "vfchsb %%v29,%%v27,%%v26\n\t"
        "vsel   %%v28,%%v24,%%v25,%%v28\n\t"
        "vsel   %%v29,%%v26,%%v27,%%v29\n\t"

        "vfchsb %%v30,%%v29,%%v28\n\t"
        "vsel   %%v30,%%v28,%%v29,%%v30\n\t"

        "vfchsb %%v31,%%v0,%%v30\n\t"
        "vsel   %%v0,%%v30,%%v0,%%v31\n\t"

        "agfi   %%r1,256\n\t"
        "brctg  %%r0,0b\n\t"

        /* fold the four lanes of v0 into word 0: shifting each doubleword
           left by 32 brings words 1 and 3 onto words 0 and 2 (words 1 and 3
           of the result are not used afterwards) */
        "veslg  %%v16,%%v0,32\n\t"
        "vfchsb %%v17,%%v0,%%v16\n\t"
        "vsel   %%v0,%%v16,%%v0,%%v17\n\t"

        /* then compare word 2 against word 0 */
        "vrepf  %%v16,%%v0,2\n\t"
        "wfchsb %%v17,%%v0,%%v16\n\t"
        "vsel   %%v0,%%v16,%%v0,%%v17\n\t"
        "ler    %[amin],%%f0"
        : [amin] "=f"(amin)
        : [n] "r"(n), [x] "ZR"((const FLOAT (*)[n])x)
        : "memory", "cc", "r0", "r1", "v0",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );

    return amin;
}
|
||||
|
||||
/*
 * Returns the minimum absolute value of n elements of x taken with stride
 * inc_x. Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the largest multiple of 64 elements is handed to
 * samin_kernel_64 and the remainder is scanned in scalar code.
 * Other strides: scalar scan, unrolled four elements at a time.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG pos;
    BLASLONG seen;

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        BLASLONG bulk = n & -64;

        if (bulk > 0) {
            best = samin_kernel_64(bulk, x);
            pos = bulk;
        } else {
            /* fewer than 64 elements: seed with the first one */
            best = ABS(x[0]);
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (ABS(x[pos]) < best)
                best = ABS(x[pos]);
        }
        return (best);
    }

    /* strided path: element 0 seeds the minimum */
    best = ABS(x[0]);
    pos = inc_x;
    seen = 1;

    BLASLONG unrolled_end = (n - 1) & -4;

    for (; seen < unrolled_end; seen += 4, pos += inc_x * 4) {
        if (ABS(x[pos]) < best)
            best = ABS(x[pos]);
        if (ABS(x[pos + inc_x]) < best)
            best = ABS(x[pos + inc_x]);
        if (ABS(x[pos + 2 * inc_x]) < best)
            best = ABS(x[pos + 2 * inc_x]);
        if (ABS(x[pos + 3 * inc_x]) < best)
            best = ABS(x[pos + 3 * inc_x]);
    }

    for (; seen < n; seen++, pos += inc_x) {
        if (ABS(x[pos]) < best)
            best = ABS(x[pos]);
    }
    return (best);
}
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * Vector kernel: returns the sum of |x[i]| over the first n elements of x.
 *
 * The loop runs n >> 6 times and each trip reads 256 bytes, so the caller is
 * expected to pass a positive multiple of 64 (the short-BFP instruction forms
 * are used, i.e. FLOAT is presumably a 4-byte float).
 *
 * Register use: r0 = remaining trip count, r1 = byte offset into x,
 * v0-v3 = four independent partial-sum vectors, v16-v23 = loaded data.
 */
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT asum;

    __asm__ (
        "vzero  %%v0\n\t"
        "vzero  %%v1\n\t"
        "vzero  %%v2\n\t"
        "vzero  %%v3\n\t"
        "srlg   %%r0,%[n],6\n\t"
        "xgr    %%r1,%%r1\n\t"
        "0:\n\t"
        "pfd    1,1024(%%r1,%[x])\n\t"

        /* first 128 bytes */
        "vl     %%v16,0(%%r1,%[x])\n\t"
        "vl     %%v17,16(%%r1,%[x])\n\t"
        "vl     %%v18,32(%%r1,%[x])\n\t"
        "vl     %%v19,48(%%r1,%[x])\n\t"
        "vl     %%v20,64(%%r1,%[x])\n\t"
        "vl     %%v21,80(%%r1,%[x])\n\t"
        "vl     %%v22,96(%%r1,%[x])\n\t"
        "vl     %%v23,112(%%r1,%[x])\n\t"

        "vflpsb %%v16,%%v16\n\t"
        "vflpsb %%v17,%%v17\n\t"
        "vflpsb %%v18,%%v18\n\t"
        "vflpsb %%v19,%%v19\n\t"
        "vflpsb %%v20,%%v20\n\t"
        "vflpsb %%v21,%%v21\n\t"
        "vflpsb %%v22,%%v22\n\t"
        "vflpsb %%v23,%%v23\n\t"

        "vfasb  %%v0,%%v0,%%v16\n\t"
        "vfasb  %%v1,%%v1,%%v17\n\t"
        "vfasb  %%v2,%%v2,%%v18\n\t"
        "vfasb  %%v3,%%v3,%%v19\n\t"
        "vfasb  %%v0,%%v0,%%v20\n\t"
        "vfasb  %%v1,%%v1,%%v21\n\t"
        "vfasb  %%v2,%%v2,%%v22\n\t"
        "vfasb  %%v3,%%v3,%%v23\n\t"

        /* second 128 bytes */
        "vl     %%v16,128(%%r1,%[x])\n\t"
        "vl     %%v17,144(%%r1,%[x])\n\t"
        "vl     %%v18,160(%%r1,%[x])\n\t"
        "vl     %%v19,176(%%r1,%[x])\n\t"
        "vl     %%v20,192(%%r1,%[x])\n\t"
        "vl     %%v21,208(%%r1,%[x])\n\t"
        "vl     %%v22,224(%%r1,%[x])\n\t"
        "vl     %%v23,240(%%r1,%[x])\n\t"

        "vflpsb %%v16,%%v16\n\t"
        "vflpsb %%v17,%%v17\n\t"
        "vflpsb %%v18,%%v18\n\t"
        "vflpsb %%v19,%%v19\n\t"
        "vflpsb %%v20,%%v20\n\t"
        "vflpsb %%v21,%%v21\n\t"
        "vflpsb %%v22,%%v22\n\t"
        "vflpsb %%v23,%%v23\n\t"

        "vfasb  %%v0,%%v0,%%v16\n\t"
        "vfasb  %%v1,%%v1,%%v17\n\t"
        "vfasb  %%v2,%%v2,%%v18\n\t"
        "vfasb  %%v3,%%v3,%%v19\n\t"
        "vfasb  %%v0,%%v0,%%v20\n\t"
        "vfasb  %%v1,%%v1,%%v21\n\t"
        "vfasb  %%v2,%%v2,%%v22\n\t"
        "vfasb  %%v3,%%v3,%%v23\n\t"

        "agfi   %%r1,256\n\t"
        "brctg  %%r0,0b\n\t"

        /* combine the four accumulators, then the four lanes of v0 */
        "vfasb  %%v0,%%v0,%%v1\n\t"
        "vfasb  %%v0,%%v0,%%v2\n\t"
        "vfasb  %%v0,%%v0,%%v3\n\t"
        "veslg  %%v1,%%v0,32\n\t"
        "vfasb  %%v0,%%v0,%%v1\n\t"
        "vrepf  %%v1,%%v0,2\n\t"
        "aebr   %%f0,%%f1\n\t"
        "ler    %[asum],%%f0"
        : [asum] "=f"(asum)
        : [n] "r"(n), [x] "ZR"((const FLOAT (*)[n])x)
        : "memory", "cc", "r0", "r1", "v0", "v1", "v2", "v3",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
    );

    return asum;
}
|
||||
|
||||
/*
 * Returns the sum of absolute values of n elements of x taken with stride
 * inc_x. Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the largest multiple of 64 elements is summed by
 * sasum_kernel_64 and the remainder is added in scalar code.
 * Other strides: two alternating partial sums over groups of four elements,
 * followed by a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT total = 0.0;
    BLASLONG pos = 0;

    if (n <= 0 || inc_x <= 0)
        return total;

    if (inc_x == 1) {
        BLASLONG bulk = n & -64;

        if (bulk > 0) {
            total = sasum_kernel_64(bulk, x);
            pos = bulk;
        }

        for (; pos < n; pos++)
            total += ABS(x[pos]);

        return total;
    }

    BLASLONG seen = 0;
    BLASLONG quads_end = n & -4;
    FLOAT part_a = 0.0;
    FLOAT part_b = 0.0;

    /* elements 0 and 2 of each group go to part_a, 1 and 3 to part_b */
    for (; seen < quads_end; seen += 4, pos += inc_x * 4) {
        part_a += ABS(x[pos]);
        part_b += ABS(x[pos + inc_x]);
        part_a += ABS(x[pos + 2 * inc_x]);
        part_b += ABS(x[pos + 3 * inc_x]);
    }
    total = part_a + part_b;

    for (; seen < n; seen++, pos += inc_x)
        total += ABS(x[pos]);

    return total;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,184 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector kernel: y[i] = (*alpha) * x[i] + y[i] for the first n elements,
 * using fused multiply-add.
 *
 * The loop runs n >> 6 times and each trip processes 256 bytes of x and y,
 * so the caller is expected to pass a positive multiple of 64 (the short-BFP
 * instruction forms are used, i.e. FLOAT is presumably a 4-byte float).
 *
 * Register use: r0 = remaining trip count, r1 = byte offset shared by x and
 * y, v0 = alpha replicated into every lane, v16-v31 = data.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    __asm__ volatile (
        "vlrepf %%v0,%[alpha]\n\t"
        "srlg   %%r0,%[n],6\n\t"
        "xgr    %%r1,%%r1\n\t"
        "0:\n\t"
        "pfd    1,1024(%%r1,%[x])\n\t"
        "pfd    2,1024(%%r1,%[y])\n\t"

        /* bytes 0..63 */
        "vl     %%v16,0(%%r1,%[x])\n\t"
        "vl     %%v17,16(%%r1,%[x])\n\t"
        "vl     %%v18,32(%%r1,%[x])\n\t"
        "vl     %%v19,48(%%r1,%[x])\n\t"
        "vl     %%v20,0(%%r1,%[y])\n\t"
        "vl     %%v21,16(%%r1,%[y])\n\t"
        "vl     %%v22,32(%%r1,%[y])\n\t"
        "vl     %%v23,48(%%r1,%[y])\n\t"

        "vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
        "vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
        "vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
        "vfmasb %%v19,%%v0,%%v19,%%v23\n\t"

        /* bytes 64..127 */
        "vl     %%v24,64(%%r1,%[x])\n\t"
        "vl     %%v25,80(%%r1,%[x])\n\t"
        "vl     %%v26,96(%%r1,%[x])\n\t"
        "vl     %%v27,112(%%r1,%[x])\n\t"
        "vl     %%v28,64(%%r1,%[y])\n\t"
        "vl     %%v29,80(%%r1,%[y])\n\t"
        "vl     %%v30,96(%%r1,%[y])\n\t"
        "vl     %%v31,112(%%r1,%[y])\n\t"

        "vfmasb %%v20,%%v0,%%v24,%%v28\n\t"
        "vfmasb %%v21,%%v0,%%v25,%%v29\n\t"
        "vfmasb %%v22,%%v0,%%v26,%%v30\n\t"
        "vfmasb %%v23,%%v0,%%v27,%%v31\n\t"

        "vst    %%v16,0(%%r1,%[y])\n\t"
        "vst    %%v17,16(%%r1,%[y])\n\t"
        "vst    %%v18,32(%%r1,%[y])\n\t"
        "vst    %%v19,48(%%r1,%[y])\n\t"
        "vst    %%v20,64(%%r1,%[y])\n\t"
        "vst    %%v21,80(%%r1,%[y])\n\t"
        "vst    %%v22,96(%%r1,%[y])\n\t"
        "vst    %%v23,112(%%r1,%[y])\n\t"

        /* bytes 128..191 */
        "vl     %%v16,128(%%r1,%[x])\n\t"
        "vl     %%v17,144(%%r1,%[x])\n\t"
        "vl     %%v18,160(%%r1,%[x])\n\t"
        "vl     %%v19,176(%%r1,%[x])\n\t"
        "vl     %%v20,128(%%r1,%[y])\n\t"
        "vl     %%v21,144(%%r1,%[y])\n\t"
        "vl     %%v22,160(%%r1,%[y])\n\t"
        "vl     %%v23,176(%%r1,%[y])\n\t"

        "vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
        "vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
        "vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
        "vfmasb %%v19,%%v0,%%v19,%%v23\n\t"

        /* bytes 192..255 */
        "vl     %%v24,192(%%r1,%[x])\n\t"
        "vl     %%v25,208(%%r1,%[x])\n\t"
        "vl     %%v26,224(%%r1,%[x])\n\t"
        "vl     %%v27,240(%%r1,%[x])\n\t"
        "vl     %%v28,192(%%r1,%[y])\n\t"
        "vl     %%v29,208(%%r1,%[y])\n\t"
        "vl     %%v30,224(%%r1,%[y])\n\t"
        "vl     %%v31,240(%%r1,%[y])\n\t"

        "vfmasb %%v20,%%v0,%%v24,%%v28\n\t"
        "vfmasb %%v21,%%v0,%%v25,%%v29\n\t"
        "vfmasb %%v22,%%v0,%%v26,%%v30\n\t"
        "vfmasb %%v23,%%v0,%%v27,%%v31\n\t"

        "vst    %%v16,128(%%r1,%[y])\n\t"
        "vst    %%v17,144(%%r1,%[y])\n\t"
        "vst    %%v18,160(%%r1,%[y])\n\t"
        "vst    %%v19,176(%%r1,%[y])\n\t"
        "vst    %%v20,192(%%r1,%[y])\n\t"
        "vst    %%v21,208(%%r1,%[y])\n\t"
        "vst    %%v22,224(%%r1,%[y])\n\t"
        "vst    %%v23,240(%%r1,%[y])\n\t"

        "agfi   %%r1,256\n\t"
        "brctg  %%r0,0b"
        :
        : [n] "r"(n), [x] "ZR"((const FLOAT (*)[n])x),
          [y] "ZR"((FLOAT (*)[n])y), [alpha] "m"(*alpha)
        : "memory", "cc", "r0", "r1", "v0",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );
}
|
||||
|
||||
/*
 * AXPY driver: y := da * x + y over n elements with strides inc_x / inc_y.
 * Always returns 0; does nothing when n <= 0.
 *
 * dummy0, dummy1, dummy and dummy2 are not used by this function.
 *
 * Both strides 1: the largest multiple of 64 elements goes to
 * saxpy_kernel_64, the rest is handled in scalar code.
 * Otherwise: scalar loop unrolled by four, then a scalar tail.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;

    if (n <= 0)
        return 0;

    if (inc_x == 1 && inc_y == 1) {
        BLASLONG bulk = n & -64;

        if (bulk)
            saxpy_kernel_64(bulk, x, y, &da);

        for (k = bulk; k < n; k++)
            y[k] += da * x[k];

        return 0;
    }

    BLASLONG quads_end = n & -4;

    for (k = 0; k < quads_end; k += 4) {
        /* all four products are formed before any element of y is written */
        FLOAT p0 = da * x[xpos];
        FLOAT p1 = da * x[xpos + inc_x];
        FLOAT p2 = da * x[xpos + 2 * inc_x];
        FLOAT p3 = da * x[xpos + 3 * inc_x];

        y[ypos] += p0;
        y[ypos + inc_y] += p1;
        y[ypos + 2 * inc_y] += p2;
        y[ypos + 3 * inc_y] += p3;

        xpos += inc_x * 4;
        ypos += inc_y * 4;
    }

    for (; k < n; k++) {
        y[ypos] += da * x[xpos];
        xpos += inc_x;
        ypos += inc_y;
    }
    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Copy kernel: moves data from x to y in 256-byte chunks with mvc.
 *
 * The loop runs n >> 6 times, so the caller is expected to pass a positive
 * multiple of 64 (256 bytes per trip corresponds to 64 elements if FLOAT is
 * a 4-byte float -- presumably the case here).
 *
 * Register use: r0 = remaining trip count, r1 = source cursor,
 * r2 = destination cursor.
 */
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        "lgr    %%r1,%[x]\n\t"
        "lgr    %%r2,%[y]\n\t"
        "srlg   %%r0,%[n],6\n\t"
        "0:\n\t"
        "pfd    1,1024(%%r1)\n\t"
        "pfd    2,1024(%%r2)\n\t"
        "mvc    0(256,%%r2),0(%%r1)\n\t"
        "agfi   %%r1,256\n\t"
        "agfi   %%r2,256\n\t"
        "brctg  %%r0,0b"
        :
        : [n] "r"(n), [x] "a"((const FLOAT (*)[n])x),
          [y] "a"((FLOAT (*)[n])y)
        : "memory", "cc", "r0", "r1", "r2"
    );
}
|
||||
|
||||
/*
 * COPY driver: y := x over n elements with strides inc_x / inc_y.
 * Always returns 0; does nothing when n <= 0.
 *
 * Both strides 1: the largest multiple of 64 elements is copied by
 * scopy_kernel_64 and the remainder element by element.
 * Otherwise: plain strided element-by-element copy.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG k = 0;

    if (n <= 0)
        return 0;

    if (inc_x == 1 && inc_y == 1) {
        BLASLONG bulk = n & -64;

        if (bulk > 0) {
            scopy_kernel_64(bulk, x, y);
            k = bulk;
        }

        for (; k < n; k++)
            y[k] = x[k];

        return 0;
    }

    BLASLONG src = 0;
    BLASLONG dst = 0;

    for (; k < n; k++) {
        y[dst] = x[src];
        src += inc_x;
        dst += inc_y;
    }
    return 0;
}
|
||||
|
|
@ -0,0 +1,140 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018,The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms,with or without
|
||||
modification,are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice,this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice,this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector kernel: returns the dot product of the first n elements of x and y.
 *
 * The loop runs n >> 5 times and each trip reads 128 bytes from each input,
 * so the caller is expected to pass a positive multiple of 32 (the short-BFP
 * instruction forms are used, i.e. FLOAT is presumably a 4-byte float).
 *
 * Register use: r0 = remaining trip count, r1 = byte offset shared by x and
 * y, v0 = accumulator, v16-v23 = x data, v24-v31 = y data.
 */
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
    FLOAT dot;

    __asm__ volatile (
        "vzero  %%v0\n\t"
        "srlg   %%r0,%[n],5\n\t"
        "xgr    %%r1,%%r1\n\t"
        "0:\n\t"
        "pfd    1,1024(%%r1,%[x])\n\t"
        "pfd    2,1024(%%r1,%[y])\n\t"

        "vl     %%v16,0(%%r1,%[x])\n\t"
        "vl     %%v17,16(%%r1,%[x])\n\t"
        "vl     %%v18,32(%%r1,%[x])\n\t"
        "vl     %%v19,48(%%r1,%[x])\n\t"
        "vl     %%v20,64(%%r1,%[x])\n\t"
        "vl     %%v21,80(%%r1,%[x])\n\t"
        "vl     %%v22,96(%%r1,%[x])\n\t"
        "vl     %%v23,112(%%r1,%[x])\n\t"

        /* each y load is followed by a fused multiply-add into v0 */
        "vl     %%v24,0(%%r1,%[y])\n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
        "vl     %%v25,16(%%r1,%[y])\n\t"
        "vfmasb %%v0,%%v17,%%v25,%%v0\n\t"
        "vl     %%v26,32(%%r1,%[y])\n\t"
        "vfmasb %%v0,%%v18,%%v26,%%v0\n\t"
        "vl     %%v27,48(%%r1,%[y])\n\t"
        "vfmasb %%v0,%%v19,%%v27,%%v0\n\t"
        "vl     %%v28,64(%%r1,%[y])\n\t"
        "vfmasb %%v0,%%v20,%%v28,%%v0\n\t"
        "vl     %%v29,80(%%r1,%[y])\n\t"
        "vfmasb %%v0,%%v21,%%v29,%%v0\n\t"
        "vl     %%v30,96(%%r1,%[y])\n\t"
        "vfmasb %%v0,%%v22,%%v30,%%v0\n\t"
        "vl     %%v31,112(%%r1,%[y])\n\t"
        "vfmasb %%v0,%%v23,%%v31,%%v0\n\t"

        "agfi   %%r1,128\n\t"
        "brctg  %%r0,0b\n\t"

        /* horizontal sum: add words 1, 2 and 3 of v0 onto word 0 */
        "vrepf  %%v1,%%v0,1\n\t"
        "vrepf  %%v2,%%v0,2\n\t"
        "vrepf  %%v3,%%v0,3\n\t"
        "aebr   %%f0,%%f1\n\t"
        "aebr   %%f0,%%f2\n\t"
        "aebr   %%f0,%%f3\n\t"
        "ler    %[dot],%%f0"
        : [dot] "=f"(dot)
        : [n] "r"(n), [x] "ZR"((const FLOAT (*)[n])x),
          [y] "ZR"((const FLOAT (*)[n])y)
        : "memory", "cc", "r0", "r1", "v0", "v1", "v2", "v3",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );

    return dot;
}
|
||||
|
||||
/*
 * DOT driver: returns the dot product of n elements of x and y taken with
 * strides inc_x / inc_y. Returns 0.0 when n <= 0.
 *
 * Both strides 1: the largest multiple of 32 elements goes to
 * sdot_kernel_32, the rest is accumulated in scalar code.
 * Otherwise: scalar loop handling two elements per trip, then a scalar tail.
 */
FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
    BLASLONG k;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;
    FLOAT acc = 0.0;

    if (n <= 0)
        return (acc);

    if (inc_x == 1 && inc_y == 1) {
        BLASLONG bulk = n & -32;

        if (bulk)
            acc = sdot_kernel_32(bulk, x, y);

        for (k = bulk; k < n; k++)
            acc += y[k] * x[k];

        return (acc);
    }

    BLASLONG pairs_end = n & -2;

    for (k = 0; k < pairs_end; k += 2) {
        acc += y[ypos] * x[xpos] + y[ypos+inc_y] * x[xpos+inc_x];
        xpos += inc_x * 2;
        ypos += inc_y * 2;
    }

    for (; k < n; k++) {
        acc += y[ypos] * x[xpos];
        xpos += inc_x;
        ypos += inc_y;
    }
    return (acc);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,668 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
/*
 * y[i] += alpha * (ap[0][i]*xo[0] + ap[1][i]*xo[1] + ap[2][i]*xo[2] + ap[3][i]*xo[3])
 *
 *   n      number of elements of y / of each column to process; the main
 *          loop covers (n & -32) elements and the tail loop (n & 28), so
 *          the lowest two bits of n are ignored by this kernel
 *   ap     four column pointers (asm operands %1..%4)
 *   xo     four multipliers (operand %5)
 *   y      vector updated in place (operand %6)
 *   alpha  pointer to the scalar (operand %7)
 *
 * Register use: r0 = loop counter, r1 = byte offset into the columns and y,
 * v0..v3 = alpha*xo[0..3] replicated, v4 = current chunk of y,
 * v16..v31 = column data. y is listed as an input operand; its
 * modification is covered by the "memory" clobber.
 */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* Replicate xo[0..3] and *alpha across vector lanes, then pre-scale by alpha. */
"vlrepf %%v0,0(%5) \n\t"
|
||||
"vlrepf %%v1,4(%5) \n\t"
|
||||
"vlrepf %%v2,8(%5) \n\t"
|
||||
"vlrepf %%v3,12(%5) \n\t"
|
||||
"vlrepf %%v4,%7 \n\t"
|
||||
"vfmsb %%v0,%%v0,%%v4 \n\t"
|
||||
"vfmsb %%v1,%%v1,%%v4 \n\t"
|
||||
"vfmsb %%v2,%%v2,%%v4 \n\t"
|
||||
"vfmsb %%v3,%%v3,%%v4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
/* r0 = n & -32; skip the unrolled loop when that is zero. */
"lghi %%r0,-32 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
/* Main loop: r0 = iteration count, 32 floats (128 bytes) per iteration. */
"srlg %%r0,%%r0,5 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 1,1024(%%r1,%3) \n\t"
|
||||
"pfd 1,1024(%%r1,%4) \n\t"
|
||||
"pfd 2,1024(%%r1,%6) \n\t"
|
||||
|
||||
/* First 64 bytes of each of the four columns. */
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,0(%%r1,%3) \n\t"
|
||||
"vl %%v19,0(%%r1,%4) \n\t"
|
||||
"vl %%v20,16(%%r1,%1) \n\t"
|
||||
"vl %%v21,16(%%r1,%2) \n\t"
|
||||
"vl %%v22,16(%%r1,%3) \n\t"
|
||||
"vl %%v23,16(%%r1,%4) \n\t"
|
||||
"vl %%v24,32(%%r1,%1) \n\t"
|
||||
"vl %%v25,32(%%r1,%2) \n\t"
|
||||
"vl %%v26,32(%%r1,%3) \n\t"
|
||||
"vl %%v27,32(%%r1,%4) \n\t"
|
||||
"vl %%v28,48(%%r1,%1) \n\t"
|
||||
"vl %%v29,48(%%r1,%2) \n\t"
|
||||
"vl %%v30,48(%%r1,%3) \n\t"
|
||||
"vl %%v31,48(%%r1,%4) \n\t"
|
||||
|
||||
/* Load a 16-byte chunk of y, add the four scaled column chunks, store it back. */
"vl %%v4,0(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,0(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,16(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,16(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,32(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,32(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,48(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,48(%%r1,%6) \n\t"
|
||||
|
||||
/* Second 64 bytes of each column, reusing v16..v31. */
"vl %%v16,64(%%r1,%1) \n\t"
|
||||
"vl %%v17,64(%%r1,%2) \n\t"
|
||||
"vl %%v18,64(%%r1,%3) \n\t"
|
||||
"vl %%v19,64(%%r1,%4) \n\t"
|
||||
"vl %%v20,80(%%r1,%1) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,80(%%r1,%3) \n\t"
|
||||
"vl %%v23,80(%%r1,%4) \n\t"
|
||||
"vl %%v24,96(%%r1,%1) \n\t"
|
||||
"vl %%v25,96(%%r1,%2) \n\t"
|
||||
"vl %%v26,96(%%r1,%3) \n\t"
|
||||
"vl %%v27,96(%%r1,%4) \n\t"
|
||||
"vl %%v28,112(%%r1,%1) \n\t"
|
||||
"vl %%v29,112(%%r1,%2) \n\t"
|
||||
"vl %%v30,112(%%r1,%3) \n\t"
|
||||
"vl %%v31,112(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v4,64(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,64(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,80(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,80(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,96(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,96(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,112(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,112(%%r1,%6) \n\t"
|
||||
|
||||
/* Advance the byte offset by 128 and loop while r0 is non-zero. */
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
/* Tail: (n & 28) elements, four floats (16 bytes) per iteration. */
"1: \n\t"
|
||||
"lghi %%r0,28 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,0(%%r1,%3) \n\t"
|
||||
"vl %%v19,0(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v4,0(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,0(%%r1,%6) \n\t"
|
||||
|
||||
"agfi %%r1,16 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * y[i] += alpha * (ap[0][i]*xo[0] + ap[1][i]*xo[1])
 *
 *   n      number of elements to process; the main loop covers (n & -32)
 *          elements and the tail loop (n & 28), so the lowest two bits of
 *          n are ignored by this kernel
 *   ap     column pointers; only ap[0] and ap[1] are used (operands %1, %2)
 *   xo     two multipliers (operand %3)
 *   y      vector updated in place (operand %4)
 *   alpha  pointer to the scalar (operand %5)
 *
 * Register use: r0 = loop counter, r1 = byte offset, v0/v1 = alpha*xo[0..1]
 * replicated, v2 = current chunk of y, v16..v31 = column data. y is listed
 * as an input operand; its modification is covered by the "memory" clobber.
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* Replicate xo[0], xo[1] and *alpha across vector lanes, then pre-scale by alpha. */
"vlrepf %%v0,0(%3) \n\t"
|
||||
"vlrepf %%v1,4(%3) \n\t"
|
||||
"vlrepf %%v2,%5 \n\t"
|
||||
"vfmsb %%v0,%%v0,%%v2 \n\t"
|
||||
"vfmsb %%v1,%%v1,%%v2 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
/* r0 = n & -32; skip the unrolled loop when that is zero. */
"lghi %%r0,-32 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
/* Main loop: 32 floats (128 bytes) per iteration. */
"srlg %%r0,%%r0,5 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 2,1024(%%r1,%4) \n\t"
|
||||
|
||||
/* Even registers hold column 0 data, odd registers column 1 data. */
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,16(%%r1,%1) \n\t"
|
||||
"vl %%v19,16(%%r1,%2) \n\t"
|
||||
"vl %%v20,32(%%r1,%1) \n\t"
|
||||
"vl %%v21,32(%%r1,%2) \n\t"
|
||||
"vl %%v22,48(%%r1,%1) \n\t"
|
||||
"vl %%v23,48(%%r1,%2) \n\t"
|
||||
"vl %%v24,64(%%r1,%1) \n\t"
|
||||
"vl %%v25,64(%%r1,%2) \n\t"
|
||||
"vl %%v26,80(%%r1,%1) \n\t"
|
||||
"vl %%v27,80(%%r1,%2) \n\t"
|
||||
"vl %%v28,96(%%r1,%1) \n\t"
|
||||
"vl %%v29,96(%%r1,%2) \n\t"
|
||||
"vl %%v30,112(%%r1,%1) \n\t"
|
||||
"vl %%v31,112(%%r1,%2) \n\t"
|
||||
|
||||
/* Load a 16-byte chunk of y, add both scaled column chunks, store it back. */
"vl %%v2,0(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v16,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v17,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,0(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,16(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v18,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v19,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,16(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,32(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v20,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v21,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,32(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,48(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v22,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v23,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,48(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,64(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v24,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v25,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,64(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,80(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v26,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v27,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,80(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,96(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v28,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v29,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,96(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,112(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v30,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v31,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,112(%%r1,%4) \n\t"
|
||||
|
||||
/* Advance the byte offset by 128 and loop while r0 is non-zero. */
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
/* Tail: (n & 28) elements, four floats (16 bytes) per iteration. */
"1: \n\t"
|
||||
"lghi %%r0,28 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v2,0(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v16,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v17,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,0(%%r1,%4) \n\t"
|
||||
|
||||
"agfi %%r1,16 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * y[i] += alpha * a0[i] * xo[0]
 *
 *   n      number of elements to process; the main loop covers (n & -32)
 *          elements and the tail loop (n & 28), so the lowest two bits of
 *          n are ignored by this kernel
 *   a0     column pointer (operand %1)
 *   xo     pointer to the single multiplier (operand %2)
 *   y      vector updated in place (operand %3)
 *   alpha  pointer to the scalar (operand %4)
 *
 * Register use: r0 = loop counter, r1 = byte offset, v0 = alpha*xo[0]
 * replicated, v1 = current chunk of y, v16..v23 = column data. The clobber
 * list also names v24..v31, which the template does not touch. y is listed
 * as an input operand; its modification is covered by the "memory" clobber.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* Replicate xo[0] and *alpha across vector lanes, then v0 = xo[0] * alpha. */
"vlrepf %%v0,0(%2) \n\t"
|
||||
"vlrepf %%v1,%4 \n\t"
|
||||
"vfmsb %%v0,%%v0,%%v1 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
/* r0 = n & -32; skip the unrolled loop when that is zero. */
"lghi %%r0,-32 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
/* Main loop: 32 floats (128 bytes) per iteration. */
"srlg %%r0,%%r0,5 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 2,1024(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,16(%%r1,%1) \n\t"
|
||||
"vl %%v18,32(%%r1,%1) \n\t"
|
||||
"vl %%v19,48(%%r1,%1) \n\t"
|
||||
"vl %%v20,64(%%r1,%1) \n\t"
|
||||
"vl %%v21,80(%%r1,%1) \n\t"
|
||||
"vl %%v22,96(%%r1,%1) \n\t"
|
||||
"vl %%v23,112(%%r1,%1) \n\t"
|
||||
|
||||
/* Load a 16-byte chunk of y, add the scaled column chunk, store it back. */
"vl %%v1,0(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,0(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,16(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v17,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,16(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,32(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v18,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,32(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,48(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v19,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,48(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,64(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v20,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,64(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,80(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v21,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,80(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,96(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v22,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,96(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,112(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v23,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,112(%%r1,%3) \n\t"
|
||||
|
||||
/* Advance the byte offset by 128 and loop while r0 is non-zero. */
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
/* Tail: (n & 28) elements, four floats (16 bytes) per iteration. */
"1: \n\t"
|
||||
"lghi %%r0,28 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v1,0(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,0(%%r1,%3) \n\t"
|
||||
|
||||
"agfi %%r1,16 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Accumulate a contiguous vector into a strided one:
 *     dest[k * inc_dest] += src[k]   for k = 0 .. n-1
 *
 *   n         number of elements to add (nothing happens when n <= 0)
 *   src       contiguous source values
 *   dest      destination vector, updated in place
 *   inc_dest  element stride of dest
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG k = 0;

    while (k < n)
    {
        dest[k * inc_dest] += src[k];
        k++;
    }
}
|
||||
|
||||
/*
 * Non-transposed matrix-vector update:
 *
 *     y[i*inc_y] += alpha * sum_j a[i + j*lda] * x[j*inc_x]
 *
 * for i in [0, m) and j in [0, n).
 *
 *   m, n     number of rows / columns of the matrix; returns immediately
 *            when either is < 1
 *   dummy1   unused
 *   alpha    scalar multiplier
 *   a, lda   matrix storage; consecutive rows of one column are adjacent,
 *            consecutive columns are lda elements apart
 *   x, inc_x input vector and its element stride
 *   y, inc_y vector updated in place and its element stride
 *   buffer   scratch space used only when inc_y != 1; must hold at least
 *            NBMAX elements
 *
 * Rows are processed in blocks of at most NBMAX (always a multiple of 4)
 * with the vector kernels; the last m & 3 rows are handled with scalar code.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    FLOAT *ap[4];
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    BLASLONG lda4 = lda << 2;
    FLOAT xbuffer[8],*ybuffer;

    if ( m < 1 ) return(0);
    if ( n < 1 ) return(0);

    ybuffer = buffer;

    n1 = n >> 2 ;                   /* number of 4-column groups    */
    n2 = n &  3 ;                   /* leftover columns             */

    m3 = m & 3  ;                   /* rows left for the scalar tail */
    m1 = m & -4 ;                   /* rows handled by the kernels   */
    m2 = (m & (NBMAX-1)) - m3 ;     /* size of the final partial row block */

    y_ptr = y;

    BLASLONG NB = NBMAX;

    /* Walk the rows in blocks of NBMAX; the last block may be shorter (m2). */
    while ( NB == NBMAX )
    {
        m1 -= NB;
        if ( m1 < 0)
        {
            if ( m2 == 0 ) break;
            NB = m2;
        }

        a_ptr = a;
        x_ptr = x;

        ap[0] = a_ptr;
        ap[1] = a_ptr + lda;
        ap[2] = ap[1] + lda;
        ap[3] = ap[2] + lda;

        /*
         * The kernels need a unit-stride y. For a strided y accumulate into
         * the zeroed scratch buffer and scatter-add afterwards. Only NB
         * elements are ever used, so clear exactly NB * sizeof(FLOAT) bytes
         * (the size must follow the element type, not a hard-coded 8).
         */
        if ( inc_y != 1 )
            memset(ybuffer, 0, NB * sizeof(FLOAT));
        else
            ybuffer = y_ptr;

        if ( inc_x == 1 )
        {
            for( i = 0; i < n1 ; i++)
            {
                sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
                x_ptr += 4;
            }

            if ( n2 & 2 )
            {
                sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
                a_ptr += lda*2;
                x_ptr += 2;
            }

            if ( n2 & 1 )
            {
                sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
                a_ptr += lda;
                x_ptr += 1;
            }
        }
        else
        {
            /* Strided x: gather the needed entries into xbuffer first. */
            for( i = 0; i < n1 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[1] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[2] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[3] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
            }

            for( i = 0; i < n2 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
                a_ptr += lda;
            }
        }

        /* Move on to the next block of rows. */
        a += NB;
        if ( inc_y != 1 )
        {
            add_y(NB,ybuffer,y_ptr,inc_y);
            y_ptr += NB * inc_y;
        }
        else
            y_ptr += NB ;
    }

    if ( m3 == 0 ) return(0);

    /* Scalar tail: the remaining 1..3 rows start at a / y_ptr. */
    if ( m3 == 3 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        FLOAT temp2 = 0.0;
        if ( lda == 3 && inc_x ==1 )
        {
            /* Fully packed 3-row matrix: unroll four columns at a time. */
            for( i = 0; i < ( n & -4 ); i+=4 )
            {
                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
                temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

                temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
                temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
                temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

                a_ptr += 12;
                x_ptr += 4;
            }

            for( ; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += 3;
                x_ptr ++;
            }
        }
        else
        {
            for( i = 0; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp2;
        return(0);
    }

    if ( m3 == 2 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        if ( lda == 2 && inc_x ==1 )
        {
            /* Fully packed 2-row matrix: unroll four columns at a time. */
            for( i = 0; i < (n & -4) ; i+=4 )
            {
                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
                temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
                a_ptr += 8;
                x_ptr += 4;
            }

            for( ; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += 2;
                x_ptr ++;
            }
        }
        else
        {
            for( i = 0; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        return(0);
    }

    if ( m3 == 1 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp = 0.0;
        if ( lda == 1 && inc_x ==1 )
        {
            /* Single contiguous row: plain dot product, four terms at a time. */
            for( i = 0; i < (n & -4); i+=4 )
            {
                temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
            }

            for( ; i < n; i++ )
            {
                temp += a_ptr[i] * x_ptr[i];
            }
        }
        else
        {
            for( i = 0; i < n; i++ )
            {
                temp += a_ptr[0] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }
        y_ptr[0] += alpha * temp;
        return(0);
    }

    return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,826 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
/*
 * Four simultaneous dot products (transposed GEMV building block):
 *
 *     y[k] = sum_i ap[k][i] * x[i]      for k = 0..3
 *
 *   n   number of elements per column; the main loop covers (n & -32)
 *       elements and the tail loop (n & 28), so the lowest two bits of n
 *       are ignored by this kernel
 *   ap  four column pointers (asm operands %1..%4)
 *   x   input vector (operand %5)
 *   y   receives the four sums; previous contents are overwritten (operand %6)
 *
 * Register use: r0 = loop counter, r1 = byte offset, v0..v3 = per-column
 * accumulators, v16..v23 = chunks of x, v24..v31 = column data, v4 = scratch
 * for the final horizontal sum. The reduction adds lanes 1..3 onto
 * %f0..%f3, i.e. it relies on %fN overlaying the first lane of %vN.
 *
 * Control flow after label 3 must fall straight into the reduction: at
 * that point r0 is zero, so any further "brctg %r0,0b" would wrap the
 * counter and re-enter the main loop far past the end of the data.
 */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* Clear the four accumulators and the byte offset. */
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "vzero %%v2 \n\t"
        "vzero %%v3 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -32; skip the unrolled loop when that is zero. */
        "lghi %%r0,-32 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        /* Main loop: 32 floats (128 bytes) per iteration. */
        "srlg %%r0,%%r0,5 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 1,1024(%%r1,%3) \n\t"
        "pfd 1,1024(%%r1,%4) \n\t"
        "pfd 1,1024(%%r1,%5) \n\t"

        /* 128 bytes of x. */
        "vl %%v16,0(%%r1,%5) \n\t"
        "vl %%v17,16(%%r1,%5) \n\t"
        "vl %%v18,32(%%r1,%5) \n\t"
        "vl %%v19,48(%%r1,%5) \n\t"
        "vl %%v20,64(%%r1,%5) \n\t"
        "vl %%v21,80(%%r1,%5) \n\t"
        "vl %%v22,96(%%r1,%5) \n\t"
        "vl %%v23,112(%%r1,%5) \n\t"

        /* For every 16-byte chunk: acc[k] += x_chunk * column_k_chunk. */
        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
        "vl %%v26,0(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t"
        "vl %%v27,0(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t"

        "vl %%v28,16(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t"
        "vl %%v29,16(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t"
        "vl %%v30,16(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t"
        "vl %%v31,16(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t"

        "vl %%v24,32(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t"
        "vl %%v25,32(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t"
        "vl %%v26,32(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t"
        "vl %%v27,32(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t"

        "vl %%v28,48(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t"
        "vl %%v29,48(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t"
        "vl %%v30,48(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t"
        "vl %%v31,48(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t"

        "vl %%v24,64(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t"
        "vl %%v25,64(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t"
        "vl %%v26,64(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t"
        "vl %%v27,64(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t"

        "vl %%v28,80(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t"
        "vl %%v29,80(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t"
        "vl %%v30,80(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t"
        "vl %%v31,80(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t"

        "vl %%v24,96(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t"
        "vl %%v25,96(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t"
        "vl %%v26,96(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t"
        "vl %%v27,96(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t"

        "vl %%v28,112(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t"
        "vl %%v29,112(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t"
        "vl %%v30,112(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t"
        "vl %%v31,112(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t"

        /* Advance the byte offset by 128 and loop while r0 is non-zero. */
        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        /* Tail: (n & 28) elements, four floats (16 bytes) per iteration. */
        "1: \n\t"
        "lghi %%r0,28 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%5) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
        "vl %%v26,0(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t"
        "vl %%v27,0(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t"

        "agfi %%r1,16 \n\t"
        "brctg %%r0,2b \n\t"

        /* Horizontal sums: fold lanes 1..3 of each accumulator into lane 0
           and store the four scalars to y[0..3]. */
        "3: \n\t"
        "vrepf %%v4,%%v0,1 \n\t"
        "aebr %%f0,%%f4 \n\t"
        "vrepf %%v4,%%v0,2 \n\t"
        "aebr %%f0,%%f4 \n\t"
        "vrepf %%v4,%%v0,3 \n\t"
        "aebr %%f0,%%f4 \n\t"
        "ste %%f0,0(%6) \n\t"
        "vrepf %%v4,%%v1,1 \n\t"
        "aebr %%f1,%%f4 \n\t"
        "vrepf %%v4,%%v1,2 \n\t"
        "aebr %%f1,%%f4 \n\t"
        "vrepf %%v4,%%v1,3 \n\t"
        "aebr %%f1,%%f4 \n\t"
        "ste %%f1,4(%6) \n\t"
        "vrepf %%v4,%%v2,1 \n\t"
        "aebr %%f2,%%f4 \n\t"
        "vrepf %%v4,%%v2,2 \n\t"
        "aebr %%f2,%%f4 \n\t"
        "vrepf %%v4,%%v2,3 \n\t"
        "aebr %%f2,%%f4 \n\t"
        "ste %%f2,8(%6) \n\t"
        "vrepf %%v4,%%v3,1 \n\t"
        "aebr %%f3,%%f4 \n\t"
        "vrepf %%v4,%%v3,2 \n\t"
        "aebr %%f3,%%f4 \n\t"
        "vrepf %%v4,%%v3,3 \n\t"
        "aebr %%f3,%%f4 \n\t"
        "ste %%f3,12(%6) "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Two simultaneous dot products (transposed GEMV building block):
 *
 *     y[k] = sum_i ap[k][i] * x[i]      for k = 0, 1
 *
 *   n   number of elements per column; the main loop covers (n & -32)
 *       elements and the tail loop (n & 28), so the lowest two bits of n
 *       are ignored by this kernel
 *   ap  column pointers; only ap[0] and ap[1] are used (operands %1, %2)
 *   x   input vector (operand %3)
 *   y   receives the two sums; previous contents are overwritten (operand %4)
 *
 * Register use: r0 = loop counter, r1 = byte offset, v0/v1 = per-column
 * accumulators, v16..v23 = chunks of x, v24..v31 = column data, v2 = scratch
 * for the final horizontal sum.
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* Clear both accumulators and the byte offset. */
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
/* r0 = n & -32; skip the unrolled loop when that is zero. */
"lghi %%r0,-32 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
/* Main loop: 32 floats (128 bytes) per iteration. */
"srlg %%r0,%%r0,5 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 1,1024(%%r1,%3) \n\t"
|
||||
|
||||
/* 128 bytes of x. */
"vl %%v16,0(%%r1,%3) \n\t"
|
||||
"vl %%v17,16(%%r1,%3) \n\t"
|
||||
"vl %%v18,32(%%r1,%3) \n\t"
|
||||
"vl %%v19,48(%%r1,%3) \n\t"
|
||||
"vl %%v20,64(%%r1,%3) \n\t"
|
||||
"vl %%v21,80(%%r1,%3) \n\t"
|
||||
"vl %%v22,96(%%r1,%3) \n\t"
|
||||
"vl %%v23,112(%%r1,%3) \n\t"
|
||||
|
||||
/* For every 16-byte chunk: acc[k] += x_chunk * column_k_chunk. */
"vl %%v24,0(%%r1,%1) \n\t"
|
||||
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,0(%%r1,%2) \n\t"
|
||||
"vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
|
||||
|
||||
"vl %%v26,16(%%r1,%1) \n\t"
|
||||
"vfmasb %%v0,%%v17,%%v26,%%v0 \n\t"
|
||||
"vl %%v27,16(%%r1,%2) \n\t"
|
||||
"vfmasb %%v1,%%v17,%%v27,%%v1 \n\t"
|
||||
|
||||
"vl %%v28,32(%%r1,%1) \n\t"
|
||||
"vfmasb %%v0,%%v18,%%v28,%%v0 \n\t"
|
||||
"vl %%v29,32(%%r1,%2) \n\t"
|
||||
"vfmasb %%v1,%%v18,%%v29,%%v1 \n\t"
|
||||
|
||||
"vl %%v30,48(%%r1,%1) \n\t"
|
||||
"vfmasb %%v0,%%v19,%%v30,%%v0 \n\t"
|
||||
"vl %%v31,48(%%r1,%2) \n\t"
|
||||
"vfmasb %%v1,%%v19,%%v31,%%v1 \n\t"
|
||||
|
||||
"vl %%v24,64(%%r1,%1) \n\t"
|
||||
"vfmasb %%v0,%%v20,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,64(%%r1,%2) \n\t"
|
||||
"vfmasb %%v1,%%v20,%%v25,%%v1 \n\t"
|
||||
|
||||
"vl %%v26,80(%%r1,%1) \n\t"
|
||||
"vfmasb %%v0,%%v21,%%v26,%%v0 \n\t"
|
||||
"vl %%v27,80(%%r1,%2) \n\t"
|
||||
"vfmasb %%v1,%%v21,%%v27,%%v1 \n\t"
|
||||
|
||||
"vl %%v28,96(%%r1,%1) \n\t"
|
||||
"vfmasb %%v0,%%v22,%%v28,%%v0 \n\t"
|
||||
"vl %%v29,96(%%r1,%2) \n\t"
|
||||
"vfmasb %%v1,%%v22,%%v29,%%v1 \n\t"
|
||||
|
||||
"vl %%v30,112(%%r1,%1) \n\t"
|
||||
"vfmasb %%v0,%%v23,%%v30,%%v0 \n\t"
|
||||
"vl %%v31,112(%%r1,%2) \n\t"
|
||||
"vfmasb %%v1,%%v23,%%v31,%%v1 \n\t"
|
||||
|
||||
/* Advance the byte offset by 128 and loop while r0 is non-zero. */
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
/* Tail: (n & 28) elements, four floats (16 bytes) per iteration. */
"1: \n\t"
|
||||
"lghi %%r0,28 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v24,0(%%r1,%1) \n\t"
|
||||
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,0(%%r1,%2) \n\t"
|
||||
"vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
|
||||
|
||||
"agfi %%r1,16 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
/* Horizontal sums: fold lanes 1..3 of each accumulator into %f0 / %f1 and store to y[0], y[1]. */
"3: \n\t"
|
||||
"vrepf %%v2,%%v0,1 \n\t"
|
||||
"aebr %%f0,%%f2 \n\t"
|
||||
"vrepf %%v2,%%v0,2 \n\t"
|
||||
"aebr %%f0,%%f2 \n\t"
|
||||
"vrepf %%v2,%%v0,3 \n\t"
|
||||
"aebr %%f0,%%f2 \n\t"
|
||||
"ste %%f0,0(%4) \n\t"
|
||||
"vrepf %%v2,%%v1,1 \n\t"
|
||||
"aebr %%f1,%%f2 \n\t"
|
||||
"vrepf %%v2,%%v1,2 \n\t"
|
||||
"aebr %%f1,%%f2 \n\t"
|
||||
"vrepf %%v2,%%v1,3 \n\t"
|
||||
"aebr %%f1,%%f2 \n\t"
|
||||
"ste %%f1,4(%4) "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Single dot product (transposed GEMV building block):
 *
 *     y[0] = sum_i a0[i] * x[i]
 *
 *   n   number of elements; the main loop covers (n & -32) elements and
 *       the tail loop (n & 28), so the lowest two bits of n are ignored
 *       by this kernel
 *   a0  column pointer (asm operand %1)
 *   x   input vector (operand %2)
 *   y   receives the sum; previous contents are overwritten (operand %3)
 *
 * Register use: r0 = loop counter, r1 = byte offset, v0 = accumulator,
 * v16..v23 = chunks of x, v24..v31 = column data, v1 = scratch for the
 * final horizontal sum.
 *
 * The main loop must end with "agfi %r1,128 / brctg %r0,0b": without the
 * pair it executes only once whatever n is, and the tail loop then starts
 * again at offset 0 instead of continuing after the unrolled part.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* Clear the accumulator and the byte offset. */
        "vzero %%v0 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -32; skip the unrolled loop when that is zero. */
        "lghi %%r0,-32 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        /* Main loop: 32 floats (128 bytes) per iteration. */
        "srlg %%r0,%%r0,5 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"

        /* 128 bytes of x. */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        /* acc += x_chunk * column_chunk for each 16-byte chunk. */
        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"

        "vl %%v25,16(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"

        "vl %%v26,32(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"

        "vl %%v27,48(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"

        "vl %%v28,64(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"

        "vl %%v29,80(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"

        "vl %%v30,96(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"

        "vl %%v31,112(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"

        /* Advance the byte offset by 128 and loop while r0 is non-zero. */
        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        /* Tail: (n & 28) elements, four floats (16 bytes) per iteration. */
        "1: \n\t"
        "lghi %%r0,28 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%2) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"

        "agfi %%r1,16 \n\t"
        "brctg %%r0,2b \n\t"

        /* Horizontal sum: fold lanes 1..3 into %f0 and store it to y[0]. */
        "3: \n\t"
        "vrepf %%v1,%%v0,1 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "vrepf %%v1,%%v0,2 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "vrepf %%v1,%%v0,3 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "ste %%f0,0(%3) "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y)
        :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Gather a strided vector into a contiguous one:
 *     dest[k] = src[k * inc_src]   for k = 0 .. n-1
 *
 *   n        number of elements to copy (nothing happens when n <= 0)
 *   src      strided source vector
 *   dest     contiguous destination
 *   inc_src  element stride of src
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    BLASLONG k = 0;

    while (k < n)
    {
        dest[k] = src[k * inc_src];
        k++;
    }
}
|
||||
|
||||
/*
 * Contiguous update dest[i] += da * src[i], done four floats per vector.
 *
 * n     element count; only whole groups of four are processed
 *       (the masks below are -32 and 28, so n & 3 trailing elements are
 *       never touched) - callers in this file pass multiples of 4
 * da    scalar multiplier, replicated into every lane of v0
 * src   contiguous input vector
 * dest  contiguous vector updated in place
 *
 * The scalar is passed with the "Q" constraint (memory operand, no index,
 * short displacement) instead of the generic "m": vlrepf can only encode a
 * 12-bit unsigned displacement, and "m" would allow the compiler to hand
 * over a long-displacement stack slot that the assembler cannot encode.
 */
static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{
    __asm__ volatile (
        "vlrepf %%v0,%1 \n\t"
        "xgr %%r1,%%r1 \n\t"              /* r1 = running byte offset */

        /* r0 = n & -32: elements handled by the 32-wide main loop */
        "lghi %%r0,-32 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        "srlg %%r0,%%r0,5 \n\t"           /* iteration count = r0 / 32 */
        "0: \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 2,1024(%%r1,%3) \n\t"

        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        /* load dest, fused multiply-add with src * da, store back */
        "vl %%v24, 0(%%r1,%3) \n\t"
        "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
        "vst %%v24, 0(%%r1,%3) \n\t"
        "vl %%v25, 16(%%r1,%3) \n\t"
        "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t"
        "vst %%v25, 16(%%r1,%3) \n\t"
        "vl %%v26, 32(%%r1,%3) \n\t"
        "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t"
        "vst %%v26, 32(%%r1,%3) \n\t"
        "vl %%v27, 48(%%r1,%3) \n\t"
        "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t"
        "vst %%v27, 48(%%r1,%3) \n\t"
        "vl %%v28, 64(%%r1,%3) \n\t"
        "vfmasb %%v28,%%v20,%%v0,%%v28 \n\t"
        "vst %%v28, 64(%%r1,%3) \n\t"
        "vl %%v29, 80(%%r1,%3) \n\t"
        "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t"
        "vst %%v29, 80(%%r1,%3) \n\t"
        "vl %%v30, 96(%%r1,%3) \n\t"
        "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t"
        "vst %%v30, 96(%%r1,%3) \n\t"
        "vl %%v31, 112(%%r1,%3) \n\t"
        "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t"
        "vst %%v31, 112(%%r1,%3) \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        /* r0 = n & 28: leftover whole groups of four elements */
        "1: \n\t"
        "lghi %%r0,28 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"           /* iteration count = r0 / 4 */
        "2: \n\t"
        "vl %%v16,0(%%r1,%2) \n\t"

        "vl %%v24, 0(%%r1,%3) \n\t"
        "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
        "vst %%v24, 0(%%r1,%3) \n\t"

        "agfi %%r1,16 \n\t"
        "brctg %%r0,2b \n\t"

        "3: \n\t"
        "nop "
        :
        :"r"(n),"Q"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
/*
 * Accumulate a scaled contiguous vector into a possibly strided one:
 * the k-th destination element (inc_dest apart) gets src[k] * da added.
 * Unit-stride destinations are handed to the vector kernel.
 */
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG k;

    if (inc_dest == 1) {
        add_y_kernel_4(n, da, src, dest);
        return;
    }

    for (k = 0; k < n; k++, dest += inc_dest)
        *dest += src[k] * da;
}
|
||||
|
||||
/*
 * Driver for the transposed GEMV update.  As the scalar tail code below
 * shows, each of the n columns of `a` (columns are lda elements apart) is
 * combined with x over the m rows, scaled by alpha and added to the
 * matching element of y:
 *
 *     y[j * inc_y] += alpha * sum_i a[i + j * lda] * x[i * inc_x]
 *
 * Rows are consumed in blocks of at most NBMAX (whole multiples of four)
 * through the sgemv_kernel_4x4 / 4x2 / 4x1 helpers; the final m & 3 rows
 * are handled by plain C loops.
 *
 * dummy1 is unused.  `buffer` is scratch: its first min(m, NBMAX) entries
 * hold a contiguous copy of the current x block when inc_x != 1, and the
 * area right after that collects per-column results before add_y() merges
 * them into y.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    FLOAT pair_out[2] __attribute__ ((aligned(16)));
    FLOAT *col[4];
    FLOAT *x_block;
    FLOAT *col_sums;
    FLOAT *sum_out;
    FLOAT *a_cur;
    FLOAT *x_cur;
    FLOAT *y_cur;
    FLOAT *aj;
    BLASLONG full_panels;
    BLASLONG quad_count;
    BLASLONG col_rest;
    BLASLONG rows_left;
    BLASLONG rows_last;
    BLASLONG rows_tail;
    BLASLONG rows;
    BLASLONG n4;
    BLASLONG p, q, k, j;

    if (m < 1 || n < 1)
        return 0;

    x_block  = buffer;
    col_sums = buffer + (m < NBMAX ? m : NBMAX);

    /* Column split: NBMAX-wide panels, then groups of four, then 0..3 */
    full_panels = n / NBMAX;
    quad_count  = (n % NBMAX) >> 2;
    col_rest    = n & 3;

    /* Row split: full NBMAX blocks, one shorter block, then 0..3 rows */
    rows_tail = m & 3;
    rows_left = m & -4;
    rows_last = (m & (NBMAX - 1)) - rows_tail;

    /* The loop ends after the first block that is shorter than NBMAX. */
    for (rows = NBMAX; rows == NBMAX; ) {
        const BLASLONG quads_per_panel = NBMAX / 4;
        const BLASLONG lda4 = 4 * lda;

        rows_left -= rows;
        if (rows_left < 0) {
            if (rows_last == 0)
                break;
            rows = rows_last;
        }

        y_cur = y;
        a_cur = a;
        x_cur = x;

        if (inc_x != 1)
            copy_x(rows, x_cur, x_block, inc_x);
        else
            x_block = x_cur;

        col[0] = a_cur;
        col[1] = a_cur + lda;
        col[2] = col[1] + lda;
        col[3] = col[2] + lda;

        /* Full panels of NBMAX columns, four columns per kernel call */
        for (p = 0; p < full_panels; p++) {
            sum_out = col_sums;
            for (q = 0; q < quads_per_panel; q++) {
                sgemv_kernel_4x4(rows, col, x_block, sum_out);
                for (k = 0; k < 4; k++)
                    col[k] += lda4;
                sum_out += 4;
            }
            add_y(quads_per_panel * 4, alpha, col_sums, y_cur, inc_y);
            y_cur += quads_per_panel * inc_y * 4;
            a_cur += quads_per_panel * lda4;
        }

        /* Remaining whole groups of four columns */
        sum_out = col_sums;
        for (q = 0; q < quad_count; q++) {
            sgemv_kernel_4x4(rows, col, x_block, sum_out);
            for (k = 0; k < 4; k++)
                col[k] += lda4;
            sum_out += 4;
        }
        if (quad_count > 0) {
            add_y(quad_count * 4, alpha, col_sums, y_cur, inc_y);
            y_cur += quad_count * inc_y * 4;
            a_cur += quad_count * lda4;
        }

        /* Two leftover columns (addressed through col[0], col[1]) */
        if (col_rest & 2) {
            sgemv_kernel_4x2(rows, col, x_block, pair_out);
            a_cur += lda * 2;
            *y_cur += pair_out[0] * alpha;
            y_cur += inc_y;
            *y_cur += pair_out[1] * alpha;
            y_cur += inc_y;
        }

        /* One leftover column (addressed through a_cur) */
        if (col_rest & 1) {
            sgemv_kernel_4x1(rows, a_cur, x_block, pair_out);
            a_cur += lda;
            *y_cur += pair_out[0] * alpha;
            y_cur += inc_y;
        }

        a += rows;
        x += rows * inc_x;
    }

    if (rows_tail == 0)
        return 0;

    /* Scalar handling of the last one to three rows */
    x_cur = x;
    aj    = a;
    y_cur = y;
    n4    = n & -4;

    switch (rows_tail) {
    case 3: {
        const FLOAT x0 = x_cur[0] * alpha;
        const FLOAT x1 = x_cur[inc_x] * alpha;
        const FLOAT x2 = x_cur[2 * inc_x] * alpha;

        if (lda == 3 && inc_y == 1) {
            /* columns are packed back to back */
            for (j = 0; j < n4; j += 4) {
                y[j]     += aj[0] * x0 + aj[1] * x1 + aj[2] * x2;
                y[j + 1] += aj[3] * x0 + aj[4] * x1 + aj[5] * x2;
                y[j + 2] += aj[6] * x0 + aj[7] * x1 + aj[8] * x2;
                y[j + 3] += aj[9] * x0 + aj[10] * x1 + aj[11] * x2;
                aj += 12;
            }
            for (; j < n; j++) {
                y[j] += aj[0] * x0 + aj[1] * x1 + aj[2] * x2;
                aj += 3;
            }
        } else if (inc_y == 1) {
            const BLASLONG step2 = lda << 1;
            const BLASLONG step4 = lda << 2;
            const BLASLONG step3 = step2 + lda;

            for (j = 0; j < n4; j += 4) {
                y[j]     += aj[0] * x0 + aj[1] * x1 + aj[2] * x2;
                y[j + 1] += aj[lda] * x0 + aj[lda + 1] * x1 + aj[lda + 2] * x2;
                y[j + 2] += aj[step2] * x0 + aj[step2 + 1] * x1 + aj[step2 + 2] * x2;
                y[j + 3] += aj[step3] * x0 + aj[step3 + 1] * x1 + aj[step3 + 2] * x2;
                aj += step4;
            }
            for (; j < n; j++) {
                y[j] += aj[0] * x0 + aj[1] * x1 + aj[2] * x2;
                aj += lda;
            }
        } else {
            for (j = 0; j < n; j++) {
                *y_cur += aj[0] * x0 + aj[1] * x1 + aj[2] * x2;
                y_cur += inc_y;
                aj += lda;
            }
        }
        break;
    }

    case 2: {
        const FLOAT x0 = x_cur[0] * alpha;
        const FLOAT x1 = x_cur[inc_x] * alpha;

        if (lda == 2 && inc_y == 1) {
            /* columns are packed back to back */
            for (j = 0; j < n4; j += 4) {
                y[j]     += aj[0] * x0 + aj[1] * x1;
                y[j + 1] += aj[2] * x0 + aj[3] * x1;
                y[j + 2] += aj[4] * x0 + aj[5] * x1;
                y[j + 3] += aj[6] * x0 + aj[7] * x1;
                aj += 8;
            }
            for (; j < n; j++) {
                y[j] += aj[0] * x0 + aj[1] * x1;
                aj += 2;
            }
        } else if (inc_y == 1) {
            const BLASLONG step2 = lda << 1;
            const BLASLONG step4 = lda << 2;
            const BLASLONG step3 = step2 + lda;

            for (j = 0; j < n4; j += 4) {
                y[j]     += aj[0] * x0 + aj[1] * x1;
                y[j + 1] += aj[lda] * x0 + aj[lda + 1] * x1;
                y[j + 2] += aj[step2] * x0 + aj[step2 + 1] * x1;
                y[j + 3] += aj[step3] * x0 + aj[step3 + 1] * x1;
                aj += step4;
            }
            for (; j < n; j++) {
                y[j] += aj[0] * x0 + aj[1] * x1;
                aj += lda;
            }
        } else {
            for (j = 0; j < n; j++) {
                *y_cur += aj[0] * x0 + aj[1] * x1;
                y_cur += inc_y;
                aj += lda;
            }
        }
        break;
    }

    default: {
        /* exactly one row left */
        const FLOAT x0 = x_cur[0] * alpha;

        if (lda == 1 && inc_y == 1) {
            for (j = 0; j < n4; j += 4) {
                y[j]     += aj[j] * x0;
                y[j + 1] += aj[j + 1] * x0;
                y[j + 2] += aj[j + 2] * x0;
                y[j + 3] += aj[j + 3] * x0;
            }
            for (; j < n; j++)
                y[j] += aj[j] * x0;
        } else if (inc_y == 1) {
            const BLASLONG step2 = lda << 1;
            const BLASLONG step4 = lda << 2;
            const BLASLONG step3 = step2 + lda;

            for (j = 0; j < n4; j += 4) {
                y[j]     += aj[0] * x0;
                y[j + 1] += aj[lda] * x0;
                y[j + 2] += aj[step2] * x0;
                y[j + 3] += aj[step3] * x0;
                aj += step4;
            }
            for (; j < n; j++) {
                y[j] += aj[0] * x0;
                aj += lda;
            }
        } else {
            for (j = 0; j < n; j++) {
                *y_cur += aj[0] * x0;
                y_cur += inc_y;
                aj += lda;
            }
        }
        break;
    }
    }

    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,186 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector search for the largest element of a contiguous float array.
 *
 * The loop counter is n >> 6 and every iteration consumes 256 bytes
 * (64 floats), so the caller is expected to pass a positive multiple
 * of 64 (CNAME below passes n & -64 after checking it is > 0).
 * v0 carries four running maxima, seeded from x[0..3]; each compare
 * (vfchsb) builds a lane mask that the following vsel uses to keep the
 * larger lane.
 */
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT result;

    __asm__ volatile (
        "vl %%v0,0(%2) \n\t"
        "srlg %%r0,%1,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* first 128 bytes of the block */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        /* pairwise reduction 8 -> 4 vectors */
        "vfchsb %%v24,%%v16,%%v17 \n\t"
        "vfchsb %%v25,%%v18,%%v19 \n\t"
        "vfchsb %%v26,%%v20,%%v21 \n\t"
        "vfchsb %%v27,%%v22,%%v23 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        /* 4 -> 2 vectors */
        "vfchsb %%v28,%%v24,%%v25 \n\t"
        "vfchsb %%v29,%%v26,%%v27 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        /* 2 -> 1 vector */
        "vfchsb %%v30,%%v28,%%v29 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        /* merge into the running maxima */
        "vfchsb %%v31,%%v30,%%v0 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        /* second 128 bytes of the block */
        "vl %%v16,128(%%r1,%2) \n\t"
        "vl %%v17,144(%%r1,%2) \n\t"
        "vl %%v18,160(%%r1,%2) \n\t"
        "vl %%v19,176(%%r1,%2) \n\t"
        "vl %%v20,192(%%r1,%2) \n\t"
        "vl %%v21,208(%%r1,%2) \n\t"
        "vl %%v22,224(%%r1,%2) \n\t"
        "vl %%v23,240(%%r1,%2) \n\t"

        "vfchsb %%v24,%%v16,%%v17 \n\t"
        "vfchsb %%v25,%%v18,%%v19 \n\t"
        "vfchsb %%v26,%%v20,%%v21 \n\t"
        "vfchsb %%v27,%%v22,%%v23 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchsb %%v28,%%v24,%%v25 \n\t"
        "vfchsb %%v29,%%v26,%%v27 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchsb %%v30,%%v28,%%v29 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        "vfchsb %%v31,%%v30,%%v0 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* horizontal step: fold the odd lanes onto the even lanes */
        "veslg %%v16,%%v0,32 \n\t"
        "vfchsb %%v17,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"

        /* horizontal step: fold lane 2 onto lane 0 */
        "vrepf %%v16,%%v0,2 \n\t"
        "wfchsb %%v17,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "ler %0,%%f0 "
        :"=f"(result)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return result;
}
|
||||
|
||||
/*
 * Largest value among n elements of x taken inc_x apart.
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * n & -64 elements go through the vector kernel and the rest through a
 * scalar loop; other strides use a scalar loop unrolled by four.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best;
    BLASLONG pos;
    BLASLONG seen;
    BLASLONG unrolled_end;

    if (n <= 0 || inc_x <= 0)
        return 0.0;

    if (inc_x == 1) {
        const BLASLONG bulk = n & -64;

        if (bulk > 0) {
            best = smax_kernel_64(bulk, x);
            pos = bulk;
        } else {
            best = x[0];
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (x[pos] > best)
                best = x[pos];
        }
        return best;
    }

    /* strided input: x[0] seeds the search, `seen` counts elements */
    best = x[0];
    pos = inc_x;
    seen = 1;

    unrolled_end = (n - 1) & -4;
    for (; seen < unrolled_end; seen += 4, pos += inc_x * 4) {
        if (x[pos] > best)
            best = x[pos];
        if (x[pos + inc_x] > best)
            best = x[pos + inc_x];
        if (x[pos + 2 * inc_x] > best)
            best = x[pos + 2 * inc_x];
        if (x[pos + 3 * inc_x] > best)
            best = x[pos + 3 * inc_x];
    }

    for (; seen < n; seen++, pos += inc_x) {
        if (x[pos] > best)
            best = x[pos];
    }
    return best;
}
|
||||
|
|
@ -0,0 +1,186 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector search for the smallest element of a contiguous float array.
 *
 * The loop counter is n >> 6 and every iteration consumes 256 bytes
 * (64 floats), so the caller is expected to pass a positive multiple
 * of 64 (CNAME below passes n & -64 after checking it is > 0).
 * v0 carries four running minima, seeded from x[0..3]; each compare
 * (vfchsb, operands swapped relative to the max kernel) builds a lane
 * mask that the following vsel uses to keep the smaller lane.
 */
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT result;

    __asm__ volatile (
        "vl %%v0,0(%2) \n\t"
        "srlg %%r0,%1,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* first 128 bytes of the block */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        /* pairwise reduction 8 -> 4 vectors */
        "vfchsb %%v24,%%v17,%%v16 \n\t"
        "vfchsb %%v25,%%v19,%%v18 \n\t"
        "vfchsb %%v26,%%v21,%%v20 \n\t"
        "vfchsb %%v27,%%v23,%%v22 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        /* 4 -> 2 vectors */
        "vfchsb %%v28,%%v25,%%v24 \n\t"
        "vfchsb %%v29,%%v27,%%v26 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        /* 2 -> 1 vector */
        "vfchsb %%v30,%%v29,%%v28 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        /* merge into the running minima */
        "vfchsb %%v31,%%v0,%%v30 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        /* second 128 bytes of the block */
        "vl %%v16,128(%%r1,%2) \n\t"
        "vl %%v17,144(%%r1,%2) \n\t"
        "vl %%v18,160(%%r1,%2) \n\t"
        "vl %%v19,176(%%r1,%2) \n\t"
        "vl %%v20,192(%%r1,%2) \n\t"
        "vl %%v21,208(%%r1,%2) \n\t"
        "vl %%v22,224(%%r1,%2) \n\t"
        "vl %%v23,240(%%r1,%2) \n\t"

        "vfchsb %%v24,%%v17,%%v16 \n\t"
        "vfchsb %%v25,%%v19,%%v18 \n\t"
        "vfchsb %%v26,%%v21,%%v20 \n\t"
        "vfchsb %%v27,%%v23,%%v22 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchsb %%v28,%%v25,%%v24 \n\t"
        "vfchsb %%v29,%%v27,%%v26 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchsb %%v30,%%v29,%%v28 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        "vfchsb %%v31,%%v0,%%v30 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* horizontal step: fold the odd lanes onto the even lanes */
        "veslg %%v16,%%v0,32 \n\t"
        "vfchsb %%v17,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"

        /* horizontal step: fold lane 2 onto lane 0 */
        "vrepf %%v16,%%v0,2 \n\t"
        "wfchsb %%v17,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "ler %0,%%f0 "
        :"=f"(result)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return result;
}
|
||||
|
||||
/*
 * Smallest value among n elements of x taken inc_x apart.
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * n & -64 elements go through the vector kernel and the rest through a
 * scalar loop; other strides use a scalar loop unrolled by four.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best;
    BLASLONG pos;
    BLASLONG seen;
    BLASLONG unrolled_end;

    if (n <= 0 || inc_x <= 0)
        return 0.0;

    if (inc_x == 1) {
        const BLASLONG bulk = n & -64;

        if (bulk > 0) {
            best = smin_kernel_64(bulk, x);
            pos = bulk;
        } else {
            best = x[0];
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (x[pos] < best)
                best = x[pos];
        }
        return best;
    }

    /* strided input: x[0] seeds the search, `seen` counts elements */
    best = x[0];
    pos = inc_x;
    seen = 1;

    unrolled_end = (n - 1) & -4;
    for (; seen < unrolled_end; seen += 4, pos += inc_x * 4) {
        if (x[pos] < best)
            best = x[pos];
        if (x[pos + inc_x] < best)
            best = x[pos + inc_x];
        if (x[pos + 2 * inc_x] < best)
            best = x[pos + 2 * inc_x];
        if (x[pos + 3 * inc_x] < best)
            best = x[pos + 3 * inc_x];
    }

    for (; seen < n; seen++, pos += inc_x) {
        if (x[pos] < best)
            best = x[pos];
    }
    return best;
}
|
||||
|
|
@ -0,0 +1,246 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * One 64-byte slice of the rotation: loads four vectors of x (v24-v27)
 * and of y (v16-v19) at the given byte offsets, computes
 *     x' = x*c + y*s   (v28-v31)
 *     y' = y*c - x*s   (v20-v23)
 * with v0 = c and v1 = s in every lane, and stores both results back.
 * Offsets are passed as string literals so the generated asm text is the
 * same as the hand-unrolled original.
 */
#define SROT_SLICE(o0, o1, o2, o3)                                    \
    "vl %%v24, " o0 "(%%r1,%1) \n\t"                                  \
    "vl %%v25, " o1 "(%%r1,%1) \n\t"                                  \
    "vl %%v26, " o2 "(%%r1,%1) \n\t"                                  \
    "vl %%v27, " o3 "(%%r1,%1) \n\t"                                  \
    "vl %%v16, " o0 "(%%r1,%2) \n\t"                                  \
    "vl %%v17, " o1 "(%%r1,%2) \n\t"                                  \
    "vl %%v18, " o2 "(%%r1,%2) \n\t"                                  \
    "vl %%v19, " o3 "(%%r1,%2) \n\t"                                  \
                                                                      \
    "vfmsb %%v28,%%v24,%%v0 \n\t"                                     \
    "vfmsb %%v29,%%v25,%%v0 \n\t"                                     \
    "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */                        \
    "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */                        \
    "vfmsb %%v30,%%v26,%%v0 \n\t"                                     \
    "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */                        \
    "vfmsb %%v31,%%v27,%%v0 \n\t"                                     \
    "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */                        \
    /* second part: fold in the y contribution */                     \
    "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"                              \
    "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */              \
    "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"                              \
    "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */              \
    "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"                              \
    "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */              \
    "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"                              \
    "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */              \
                                                                      \
    "vst %%v28, " o0 "(%%r1,%1) \n\t"                                 \
    "vst %%v29, " o1 "(%%r1,%1) \n\t"                                 \
    "vst %%v30, " o2 "(%%r1,%1) \n\t"                                 \
    "vst %%v31, " o3 "(%%r1,%1) \n\t"                                 \
    "vst %%v20, " o0 "(%%r1,%2) \n\t"                                 \
    "vst %%v21, " o1 "(%%r1,%2) \n\t"                                 \
    "vst %%v22, " o2 "(%%r1,%2) \n\t"                                 \
    "vst %%v23, " o3 "(%%r1,%2) \n\t"

/*
 * In-place plane rotation of two contiguous float vectors:
 *     x[i] <- c*x[i] + s*y[i]
 *     y[i] <- c*y[i] - s*x[i]
 *
 * n     element count; the loop counter is n >> 6 and each iteration
 *       handles 64 floats (256 bytes), so callers must pass a positive
 *       multiple of 64 (CNAME below passes n & -64 after checking > 0)
 * x, y  vectors rotated in place
 * c, s  pointers to the rotation coefficients (read only)
 *
 * The coefficients use the "Q" constraint (memory operand, no index,
 * short displacement) instead of the generic "m": vlrepf can only encode
 * a 12-bit unsigned displacement, and "m" would allow an address the
 * assembler cannot encode.  `volatile` is spelled out to match the other
 * kernels; an asm without outputs is implicitly volatile anyway.
 */
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
    __asm__ volatile (
        "vlrepf %%v0,%3 \n\t"
        "vlrepf %%v1,%4 \n\t"
        "srlg %%r0,%0,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 2, 1024(%%r1,%1) \n\t"
        "pfd 2, 1024(%%r1,%2) \n\t"

        SROT_SLICE("0", "16", "32", "48")
        SROT_SLICE("64", "80", "96", "112")
        SROT_SLICE("128", "144", "160", "176")
        SROT_SLICE("192", "208", "224", "240")

        "agfi %%r1,256 \n\t"
        "brctg %%r0,0b "
        :
        :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"Q"(*c),"Q"(*s)
        :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}

#undef SROT_SLICE
|
||||
|
||||
/*
 * Apply a plane rotation to the vector pair (x, y) in place:
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]
 * for n elements taken inc_x / inc_y apart.  When both strides are 1 the
 * leading n & -64 elements go through the vector kernel; everything else
 * is handled by scalar loops.  Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k = 0;
    FLOAT rotated_x;

    if (n <= 0)
        return 0;

    if (inc_x != 1 || inc_y != 1) {
        /* generic strides */
        BLASLONG xi = 0;
        BLASLONG yi = 0;

        for (; k < n; k++, xi += inc_x, yi += inc_y) {
            rotated_x = c * x[xi] + s * y[yi];
            y[yi] = c * y[yi] - s * x[xi];
            x[xi] = rotated_x;
        }
        return 0;
    }

    {
        const BLASLONG bulk = n & -64;

        if (bulk > 0) {
            /* the kernel takes the coefficients by address */
            FLOAT cosa = c;
            FLOAT sina = s;

            srot_kernel_64(bulk, x, y, &cosa, &sina);
            k = bulk;
        }
    }

    for (; k < n; k++) {
        rotated_x = c * x[k] + s * y[k];
        y[k] = c * y[k] - s * x[k];
        x[k] = rotated_x;
    }

    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,201 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Unit-stride scaling kernel: x[i] = da * x[i], performed in place.
 *
 * Each pass of the asm loop covers 128 bytes of x (eight 16-byte vector
 * load / multiply / store triples, i.e. 32 four-byte elements) and the pass
 * count is n >> 5.  The loop body runs before the count is tested, so n is
 * expected to be a positive multiple of 32; CNAME in this file only calls
 * this kernel with (n & -32) when that value is > 0.
 *
 * Operands: %0 = n, %1 = da (read from memory), %2 = x.
 * r0 holds the remaining pass count, r1 the running byte offset into x.
 */
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* v0 <- da replicated into every 32-bit lane */
"vlrepf %%v0,%1 \n\t"
|
||||
/* r0 <- n / 32 (pass count), r1 <- 0 (byte offset) */
"srlg %%r0,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
/* prefetch 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
/* eight load / multiply-by-v0 / store-back triples, offsets 0..112 */
"vl %%v24, 0(%%r1,%2) \n\t"
|
||||
"vfmsb %%v24,%%v24,%%v0 \n\t"
|
||||
"vst %%v24, 0(%%r1,%2) \n\t"
|
||||
"vl %%v25, 16(%%r1,%2) \n\t"
|
||||
"vfmsb %%v25,%%v25,%%v0 \n\t"
|
||||
"vst %%v25, 16(%%r1,%2) \n\t"
|
||||
"vl %%v26, 32(%%r1,%2) \n\t"
|
||||
"vfmsb %%v26,%%v26,%%v0 \n\t"
|
||||
"vst %%v26, 32(%%r1,%2) \n\t"
|
||||
"vl %%v27, 48(%%r1,%2) \n\t"
|
||||
"vfmsb %%v27,%%v27,%%v0 \n\t"
|
||||
"vst %%v27, 48(%%r1,%2) \n\t"
|
||||
"vl %%v24, 64(%%r1,%2) \n\t"
|
||||
"vfmsb %%v24,%%v24,%%v0 \n\t"
|
||||
"vst %%v24, 64(%%r1,%2) \n\t"
|
||||
"vl %%v25, 80(%%r1,%2) \n\t"
|
||||
"vfmsb %%v25,%%v25,%%v0 \n\t"
|
||||
"vst %%v25, 80(%%r1,%2) \n\t"
|
||||
"vl %%v26, 96(%%r1,%2) \n\t"
|
||||
"vfmsb %%v26,%%v26,%%v0 \n\t"
|
||||
"vst %%v26, 96(%%r1,%2) \n\t"
|
||||
"vl %%v27, 112(%%r1,%2) \n\t"
|
||||
"vfmsb %%v27,%%v27,%%v0 \n\t"
|
||||
"vst %%v27, 112(%%r1,%2) \n\t"
|
||||
/* advance 128 bytes; decrement r0 and loop while it is non-zero */
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
/* x is only listed as an input although it is stored to; the "memory"
   clobber below is what tells the compiler that memory changes */
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Unit-stride zero-fill kernel used by CNAME when da == 0.
 *
 * Stores zeroed 16-byte vectors over x, 128 bytes (eight stores) per pass,
 * for n >> 5 passes.  The loop body runs before the count is tested, so n is
 * expected to be a positive multiple of 32; CNAME only calls this kernel
 * with (n & -32) when that value is > 0.  x is never read here.
 *
 * Operands: %0 = n, %1 = x.
 * r0 holds the remaining pass count, r1 the running byte offset into x.
 */
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile(
|
||||
/* v24..v27 <- all zero bits */
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
/* r0 <- n / 32 (pass count), r1 <- 0 (byte offset) */
"srlg %%r0,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
/* prefetch 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
|
||||
/* eight 16-byte zero stores, offsets 0..112 */
"vst %%v24,0(%%r1,%1) \n\t"
|
||||
"vst %%v25,16(%%r1,%1) \n\t"
|
||||
"vst %%v26,32(%%r1,%1) \n\t"
|
||||
"vst %%v27,48(%%r1,%1) \n\t"
|
||||
"vst %%v24,64(%%r1,%1) \n\t"
|
||||
"vst %%v25,80(%%r1,%1) \n\t"
|
||||
"vst %%v26,96(%%r1,%1) \n\t"
|
||||
"vst %%v27,112(%%r1,%1) \n\t"
|
||||
|
||||
/* advance 128 bytes; decrement r0 and loop while it is non-zero */
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
/* x is only listed as an input although it is stored to; the "memory"
   clobber below is what tells the compiler that memory changes */
:"r"(n),"ZR"((FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v24","v25","v26","v27"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Scale a vector in place: x[k * inc_x] = da * x[k * inc_x] for k in [0, n).
 *
 * Nothing is done (and 0 is returned) when n <= 0 or inc_x <= 0.
 * When da compares equal to zero the elements are overwritten with 0.0
 * without being read.  For inc_x == 1 the leading (n & -32) elements go
 * through the vector kernels and the remainder is handled by scalar code.
 * dummy0, dummy1, y, inc_y, dummy and dummy2 are not used.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG pos;

    if (n <= 0 || inc_x <= 0) {
        return 0;
    }

    if (inc_x == 1) {
        /* Largest multiple of 32 not exceeding n; 0 when n < 32. */
        const BLASLONG bulk = n & -32;

        if (da == 0.0) {
            if (bulk > 0) {
                sscal_kernel_32_zero(bulk, x);
            }
            for (k = bulk; k < n; k++) {
                x[k] = 0.0;
            }
        } else {
            if (bulk > 0) {
                sscal_kernel_32(bulk, da, x);
            }
            for (k = bulk; k < n; k++) {
                x[k] = da * x[k];
            }
        }
        return 0;
    }

    /* Strided access: walk the n elements one at a time. */
    pos = 0;
    if (da == 0.0) {
        for (k = 0; k < n; k++) {
            x[pos] = 0.0;
            pos += inc_x;
        }
    } else {
        for (k = 0; k < n; k++) {
            x[pos] = da * x[pos];
            pos += inc_x;
        }
    }

    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,164 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Unit-stride swap kernel: exchanges the contents of x and y.
 *
 * Each pass of the asm loop handles 256 bytes of each array (sixteen 16-byte
 * vectors, i.e. 64 four-byte elements): the x chunk is parked in v16..v31,
 * the y chunk is copied into x in two 128-byte halves through v0..v7, and
 * finally v16..v31 are stored to y.  The pass count is n >> 6 and the body
 * runs before the count is tested, so n is expected to be a positive
 * multiple of 64; CNAME in this file only calls this kernel with (n & -64)
 * when that value is > 0.
 *
 * Operands: %0 = n, %1 = x, %2 = y.
 * r0 holds the remaining pass count, r1 the running byte offset.
 */
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
/* r0 <- n / 64 (pass count), r1 <- 0 (byte offset) */
"srlg %%r0,%0,6 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
/* prefetch both arrays 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
/* v16..v31 <- 256 bytes of x */
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
||||
|
||||
/* first 128 bytes: y -> v0..v7 -> x */
"vl %%v0, 0(%%r1,%2) \n\t"
|
||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
||||
|
||||
/* second 128 bytes: y -> v0..v7 -> x */
"vl %%v0, 128(%%r1,%2) \n\t"
|
||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
||||
|
||||
/* saved x chunk (v16..v31) -> y */
"vst %%v16, 0(%%r1,%2) \n\t"
|
||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
||||
|
||||
/* advance 256 bytes; decrement r0 and loop while it is non-zero */
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
/* x and y are only listed as inputs although both are stored to; the
   "memory" clobber below is what tells the compiler that memory changes */
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Swap two vectors element by element: x[k * inc_x] <-> y[k * inc_y]
 * for k in [0, n).
 *
 * Nothing is done when n <= 0.  When both strides are 1 the leading
 * (n & -64) elements are exchanged by the vector kernel and the remainder by
 * scalar code; any other stride combination uses the scalar loop only.
 * dummy0, dummy1, dummy3, dummy and dummy2 are not used.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k = 0;
    BLASLONG xi = 0;
    BLASLONG yi = 0;
    FLOAT held;

    if (n <= 0) {
        return 0;
    }

    if (inc_x == 1 && inc_y == 1) {
        /* Largest multiple of 64 not exceeding n; 0 when n < 64. */
        const BLASLONG bulk = n & -64;

        if (bulk > 0) {
            sswap_kernel_64(bulk, x, y);
            k = bulk;
        }

        for (; k < n; k++) {
            held = y[k];
            y[k] = x[k];
            x[k] = held;
        }
        return 0;
    }

    /* General strides: advance a separate index for each vector. */
    for (; k < n; k++) {
        held = y[yi];
        y[yi] = x[xi];
        x[xi] = held;
        xi += inc_x;
        yi += inc_y;
    }

    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,221 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/* ABS: absolute value at the build's precision (fabs when DOUBLE is
   defined, fabsf otherwise). */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/* CABS1(x,i): |x[i]| + |x[i + 1]|, i.e. |re| + |im| of the complex value
   whose real part is stored at index i of x.  Both arguments are
   parenthesized so that expression arguments (e.g. a pointer sum) expand
   correctly; each argument is evaluated twice, so avoid side effects. */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
|
||||
|
||||
/*
 * Unit-stride kernel returning the largest |re| + |im| over the first n
 * complex elements of x (stored as interleaved re, im pairs).
 *
 * The asm gathers real parts and imaginary parts of two consecutive complex
 * values into separate vectors (8-byte element loads), takes absolute
 * values, adds them, and keeps a running element-wise maximum in v0.  Each
 * pass covers 256 bytes = 16 complex values, in two identical 128-byte
 * halves; the pass count is n >> 4.  The body runs before the count is
 * tested, so n is expected to be a positive multiple of 16; CNAME in this
 * file only calls this kernel with (n & -16) when that value is > 0.
 * NOTE(review): every instruction used here works on 64-bit elements, so
 * this presumably requires FLOAT to be double - confirm against the build.
 *
 * Operands: %0 = result, %1 = n, %2 = x.
 * r0 holds the remaining pass count, r1 the running byte offset into x.
 */
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
FLOAT amax;
|
||||
|
||||
__asm__ volatile (
|
||||
/* seed v0 with |re| + |im| of the first two complex values */
"vleg %%v0,0(%2),0 \n\t"
|
||||
"vleg %%v16,8(%2),0 \n\t"
|
||||
"vleg %%v0,16(%2),1 \n\t"
|
||||
"vleg %%v16,24(%2),1 \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vflpdb %%v16,%%v16 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
/* r0 <- n / 16 (pass count), r1 <- 0 (byte offset) */
"srlg %%r0,%1,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
|
||||
/* first half: even registers collect real parts, odd ones imaginary parts */
"vleg %%v16,0(%%r1,%2),0 \n\t"
|
||||
"vleg %%v17,8(%%r1,%2),0 \n\t"
|
||||
"vleg %%v16,16(%%r1,%2),1 \n\t"
|
||||
"vleg %%v17,24(%%r1,%2),1 \n\t"
|
||||
"vleg %%v18,32(%%r1,%2),0 \n\t"
|
||||
"vleg %%v19,40(%%r1,%2),0 \n\t"
|
||||
"vleg %%v18,48(%%r1,%2),1 \n\t"
|
||||
"vleg %%v19,56(%%r1,%2),1 \n\t"
|
||||
"vleg %%v20,64(%%r1,%2),0 \n\t"
|
||||
"vleg %%v21,72(%%r1,%2),0 \n\t"
|
||||
"vleg %%v20,80(%%r1,%2),1 \n\t"
|
||||
"vleg %%v21,88(%%r1,%2),1 \n\t"
|
||||
"vleg %%v22,96(%%r1,%2),0 \n\t"
|
||||
"vleg %%v23,104(%%r1,%2),0 \n\t"
|
||||
"vleg %%v22,112(%%r1,%2),1 \n\t"
|
||||
"vleg %%v23,120(%%r1,%2),1 \n\t"
|
||||
/* absolute values, then |re| + |im| into v16..v19 */
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
/* pairwise maximum: compare builds a mask, select keeps the larger lane */
"vfchdb %%v24,%%v16,%%v17 \n\t"
|
||||
"vfchdb %%v25,%%v18,%%v19 \n\t"
|
||||
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
|
||||
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v26,%%v24,%%v25 \n\t"
|
||||
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
|
||||
|
||||
/* fold into the running maximum v0 */
"vfchdb %%v27,%%v26,%%v0 \n\t"
|
||||
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
|
||||
|
||||
/* second half: same sequence for byte offsets 128..255 */
"vleg %%v16,128(%%r1,%2),0 \n\t"
|
||||
"vleg %%v17,136(%%r1,%2),0 \n\t"
|
||||
"vleg %%v16,144(%%r1,%2),1 \n\t"
|
||||
"vleg %%v17,152(%%r1,%2),1 \n\t"
|
||||
"vleg %%v18,160(%%r1,%2),0 \n\t"
|
||||
"vleg %%v19,168(%%r1,%2),0 \n\t"
|
||||
"vleg %%v18,176(%%r1,%2),1 \n\t"
|
||||
"vleg %%v19,184(%%r1,%2),1 \n\t"
|
||||
"vleg %%v20,192(%%r1,%2),0 \n\t"
|
||||
"vleg %%v21,200(%%r1,%2),0 \n\t"
|
||||
"vleg %%v20,208(%%r1,%2),1 \n\t"
|
||||
"vleg %%v21,216(%%r1,%2),1 \n\t"
|
||||
"vleg %%v22,224(%%r1,%2),0 \n\t"
|
||||
"vleg %%v23,232(%%r1,%2),0 \n\t"
|
||||
"vleg %%v22,240(%%r1,%2),1 \n\t"
|
||||
"vleg %%v23,248(%%r1,%2),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vfchdb %%v24,%%v16,%%v17 \n\t"
|
||||
"vfchdb %%v25,%%v18,%%v19 \n\t"
|
||||
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
|
||||
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v26,%%v24,%%v25 \n\t"
|
||||
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v26,%%v0 \n\t"
|
||||
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
|
||||
|
||||
/* advance 256 bytes; decrement r0 and loop while it is non-zero */
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
/* reduce the two lanes of v0 to one value and hand it back via f0 */
"vrepg %%v16,%%v0,1 \n\t"
|
||||
"wfchdb %%v17,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(amax)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
||||
);
|
||||
|
||||
return amax;
|
||||
}
|
||||
|
||||
/*
 * Returns the maximum of |re| + |im| over the n complex elements of x.
 *
 * x holds interleaved (re, im) FLOAT pairs; element k has its real part at
 * x[2 * k * inc_x] and its imaginary part right after it.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * For inc_x == 1 the leading (n & -16) elements are reduced by
 * zamax_kernel_16; otherwise the running maximum is seeded from element 0.
 * All remaining elements are folded in by the scalar loop below, which
 * compares the full |re| + |im| of each element (the imaginary part must not
 * be left out of the tail comparison, or the result can be too small).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i;          /* number of elements already accounted for */
    BLASLONG ix;         /* index in x of the current element's real part */
    BLASLONG inc_x2;     /* distance in FLOATs between consecutive elements */
    FLOAT maxf = 0.0;
    FLOAT cur;

    if (n <= 0 || inc_x <= 0) {
        return (maxf);
    }

    inc_x2 = 2 * inc_x;
    i = 1;

    if (inc_x == 1) {
        /* Largest multiple of 16 not exceeding n; 0 when n < 16. */
        BLASLONG n1 = n & -16;

        if (n1 > 0) {
            maxf = zamax_kernel_16(n1, x);
            i = n1;
        } else {
            maxf = CABS1(x, 0);
        }
    } else {
        maxf = CABS1(x, 0);
    }

    /* Scalar pass over everything the kernel / seed did not cover. */
    ix = i * inc_x2;
    for (; i < n; i++) {
        cur = CABS1(x, ix);
        if (cur > maxf) {
            maxf = cur;
        }
        ix += inc_x2;
    }

    return (maxf);
}
|
||||
|
|
@ -0,0 +1,221 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/* ABS: absolute value at the build's precision (fabs when DOUBLE is
   defined, fabsf otherwise). */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/* CABS1(x,i): |x[i]| + |x[i + 1]|, i.e. |re| + |im| of the complex value
   whose real part is stored at index i of x.  Both arguments are
   parenthesized so that expression arguments (e.g. a pointer sum) expand
   correctly; each argument is evaluated twice, so avoid side effects. */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
|
||||
|
||||
/*
 * Unit-stride kernel returning the smallest |re| + |im| over the first n
 * complex elements of x (stored as interleaved re, im pairs).
 *
 * The asm gathers real parts and imaginary parts of two consecutive complex
 * values into separate vectors (8-byte element loads), takes absolute
 * values, adds them, and keeps a running element-wise minimum in v0.  Each
 * pass covers 256 bytes = 16 complex values, in two identical 128-byte
 * halves; the pass count is n >> 4.  The body runs before the count is
 * tested, so n is expected to be a positive multiple of 16; CNAME in this
 * file only calls this kernel with (n & -16) when that value is > 0.
 * NOTE(review): every instruction used here works on 64-bit elements, so
 * this presumably requires FLOAT to be double - confirm against the build.
 *
 * Operands: %0 = result, %1 = n, %2 = x.
 * r0 holds the remaining pass count, r1 the running byte offset into x.
 */
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
FLOAT amin;
|
||||
|
||||
__asm__ volatile (
|
||||
/* seed v0 with |re| + |im| of the first two complex values */
"vleg %%v0,0(%2),0 \n\t"
|
||||
"vleg %%v16,8(%2),0 \n\t"
|
||||
"vleg %%v0,16(%2),1 \n\t"
|
||||
"vleg %%v16,24(%2),1 \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vflpdb %%v16,%%v16 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
/* r0 <- n / 16 (pass count), r1 <- 0 (byte offset) */
"srlg %%r0,%1,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
|
||||
/* first half: even registers collect real parts, odd ones imaginary parts */
"vleg %%v16,0(%%r1,%2),0 \n\t"
|
||||
"vleg %%v17,8(%%r1,%2),0 \n\t"
|
||||
"vleg %%v16,16(%%r1,%2),1 \n\t"
|
||||
"vleg %%v17,24(%%r1,%2),1 \n\t"
|
||||
"vleg %%v18,32(%%r1,%2),0 \n\t"
|
||||
"vleg %%v19,40(%%r1,%2),0 \n\t"
|
||||
"vleg %%v18,48(%%r1,%2),1 \n\t"
|
||||
"vleg %%v19,56(%%r1,%2),1 \n\t"
|
||||
"vleg %%v20,64(%%r1,%2),0 \n\t"
|
||||
"vleg %%v21,72(%%r1,%2),0 \n\t"
|
||||
"vleg %%v20,80(%%r1,%2),1 \n\t"
|
||||
"vleg %%v21,88(%%r1,%2),1 \n\t"
|
||||
"vleg %%v22,96(%%r1,%2),0 \n\t"
|
||||
"vleg %%v23,104(%%r1,%2),0 \n\t"
|
||||
"vleg %%v22,112(%%r1,%2),1 \n\t"
|
||||
"vleg %%v23,120(%%r1,%2),1 \n\t"
|
||||
/* absolute values, then |re| + |im| into v16..v19 */
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
/* pairwise minimum: the compare operands are swapped relative to a
   maximum, so the select keeps the smaller lane */
"vfchdb %%v24,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v25,%%v19,%%v18 \n\t"
|
||||
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
|
||||
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v26,%%v25,%%v24 \n\t"
|
||||
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
|
||||
|
||||
/* fold into the running minimum v0 */
"vfchdb %%v27,%%v0,%%v26 \n\t"
|
||||
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
|
||||
|
||||
/* second half: same sequence for byte offsets 128..255 */
"vleg %%v16,128(%%r1,%2),0 \n\t"
|
||||
"vleg %%v17,136(%%r1,%2),0 \n\t"
|
||||
"vleg %%v16,144(%%r1,%2),1 \n\t"
|
||||
"vleg %%v17,152(%%r1,%2),1 \n\t"
|
||||
"vleg %%v18,160(%%r1,%2),0 \n\t"
|
||||
"vleg %%v19,168(%%r1,%2),0 \n\t"
|
||||
"vleg %%v18,176(%%r1,%2),1 \n\t"
|
||||
"vleg %%v19,184(%%r1,%2),1 \n\t"
|
||||
"vleg %%v20,192(%%r1,%2),0 \n\t"
|
||||
"vleg %%v21,200(%%r1,%2),0 \n\t"
|
||||
"vleg %%v20,208(%%r1,%2),1 \n\t"
|
||||
"vleg %%v21,216(%%r1,%2),1 \n\t"
|
||||
"vleg %%v22,224(%%r1,%2),0 \n\t"
|
||||
"vleg %%v23,232(%%r1,%2),0 \n\t"
|
||||
"vleg %%v22,240(%%r1,%2),1 \n\t"
|
||||
"vleg %%v23,248(%%r1,%2),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vfchdb %%v24,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v25,%%v19,%%v18 \n\t"
|
||||
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
|
||||
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v26,%%v25,%%v24 \n\t"
|
||||
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v0,%%v26 \n\t"
|
||||
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
|
||||
|
||||
/* advance 256 bytes; decrement r0 and loop while it is non-zero */
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
/* reduce the two lanes of v0 to one value and hand it back via f0 */
"vrepg %%v16,%%v0,1 \n\t"
|
||||
"wfchdb %%v17,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(amin)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
||||
);
|
||||
|
||||
return amin;
|
||||
}
|
||||
|
||||
/*
 * Returns the minimum of |re| + |im| over the n complex elements of x.
 *
 * x holds interleaved (re, im) FLOAT pairs; element k has its real part at
 * x[2 * k * inc_x] and its imaginary part right after it.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * For inc_x == 1 the leading (n & -16) elements are reduced by
 * zamin_kernel_16; otherwise the running minimum is seeded from element 0.
 * All remaining elements are folded in by the scalar loop below, which
 * compares the full |re| + |im| of each element (comparing |re| alone
 * against the running minimum could return a value below the true minimum).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i;          /* number of elements already accounted for */
    BLASLONG ix;         /* index in x of the current element's real part */
    BLASLONG inc_x2;     /* distance in FLOATs between consecutive elements */
    FLOAT minf = 0.0;
    FLOAT cur;

    if (n <= 0 || inc_x <= 0) {
        return (minf);
    }

    inc_x2 = 2 * inc_x;
    i = 1;

    if (inc_x == 1) {
        /* Largest multiple of 16 not exceeding n; 0 when n < 16. */
        BLASLONG n1 = n & -16;

        if (n1 > 0) {
            minf = zamin_kernel_16(n1, x);
            i = n1;
        } else {
            minf = CABS1(x, 0);
        }
    } else {
        minf = CABS1(x, 0);
    }

    /* Scalar pass over everything the kernel / seed did not cover. */
    ix = i * inc_x2;
    for (; i < n; i++) {
        cur = CABS1(x, ix);
        if (cur < minf) {
            minf = cur;
        }
        ix += inc_x2;
    }

    return (minf);
}
|
||||
|
|
@ -25,92 +25,98 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
|
||||
|
||||
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
FLOAT asum;
|
||||
|
||||
__asm__ (
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v22 \n\t"
|
||||
"vzero %%v23 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
"vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v26 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v30 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v31 \n\t"
|
||||
|
||||
"vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v26 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v30 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v31 \n\t"
|
||||
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
"vfadb %%v24,%%v0,%%v1 \n\t"
|
||||
"vfadb %%v25,%%v23,%%v22 \n\t"
|
||||
"vfadb %%v0,%%v25,%%v24 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %[asum] ,%%f0"
|
||||
: [asum] "=f"(asum),[ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x)
|
||||
: "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v2 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"srlg %%r0,%1,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
"vl %%v20, 64(%%r1,%2) \n\t"
|
||||
"vl %%v21, 80(%%r1,%2) \n\t"
|
||||
"vl %%v22, 96(%%r1,%2) \n\t"
|
||||
"vl %%v23, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
||||
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
"vl %%v20, 192(%%r1,%2) \n\t"
|
||||
"vl %%v21, 208(%%r1,%2) \n\t"
|
||||
"vl %%v22, 224(%%r1,%2) \n\t"
|
||||
"vl %%v23, 240(%%r1,%2) \n\t"
|
||||
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
"vfadb %%v0,%%v0,%%v1 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v2 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v3 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(asum)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
);
|
||||
|
||||
return asum;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
|
@ -128,7 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
if ( n1 > 0 )
|
||||
{
|
||||
|
||||
sumf=zasum_kernel_16(n1, x );
|
||||
sumf = zasum_kernel_16(n1, x);
|
||||
i=n1;
|
||||
ip=2*n1;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,142 +23,98 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) {
|
||||
|
||||
BLASLONG tempR1 ;
|
||||
__asm__ ("pfd 1, 0(%[x_tmp]) \n\t"
|
||||
"pfd 2, 0(%[y_tmp]) \n\t"
|
||||
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile(
|
||||
#if !defined(CONJ)
|
||||
"lgdr %[t1],%[alpha_r] \n\t"
|
||||
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"lgdr %[t1],%[alpha_i] \n\t"
|
||||
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"vflcdb %%v29,%%v29 \n\t" //complement both
|
||||
"vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i}
|
||||
"vlrepg %%v0,0(%3) \n\t"
|
||||
"vleg %%v1,8(%3),0 \n\t"
|
||||
"wflcdb %%v1,%%v1 \n\t"
|
||||
"vleg %%v1,8(%3),1 \n\t"
|
||||
#else
|
||||
"vleg %%v0,0(%3),1 \n\t"
|
||||
"vflcdb %%v0,%%v0 \n\t"
|
||||
"vleg %%v0,0(%3),0 \n\t"
|
||||
"vlrepg %%v1,8(%3) \n\t"
|
||||
#endif
|
||||
"srlg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
#else
|
||||
"lgdr %[t1],%[alpha_i] \n\t"
|
||||
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"lgdr %[t1],%[alpha_r] \n\t"
|
||||
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"vflcdb %%v28,%%v28 \n\t" //complement both
|
||||
"vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r}
|
||||
#endif
|
||||
|
||||
"xgr %[t1],%[t1] \n\t"
|
||||
"sllg %[tmp],%[tmp],4 \n\t"
|
||||
"vl %%v30 , 0(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v31 , 16(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v6 , 32(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v7 , 48(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v20 , 0(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21 , 16(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22 , 32(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23 , 48(%[t1],%[x_tmp]) \n\t"
|
||||
"lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition
|
||||
"j 2f \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
"vfmadb %%v16, %%v20, %%v28, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v21, %%v28, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v22, %%v28, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v23, %%v28, %%v19 \n\t"
|
||||
"vl %%v30, 64(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v31, 80(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v6 , 96(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v7 , 112(%[t1],%[y_tmp]) \n\t"
|
||||
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
|
||||
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,16(%%r1,%1) \n\t"
|
||||
"vl %%v18,32(%%r1,%1) \n\t"
|
||||
"vl %%v19,48(%%r1,%1) \n\t"
|
||||
"vl %%v20,0(%%r1,%2) \n\t"
|
||||
"vl %%v21,16(%%r1,%2) \n\t"
|
||||
"vl %%v22,32(%%r1,%2) \n\t"
|
||||
"vl %%v23,48(%%r1,%2) \n\t"
|
||||
"vpdi %%v24,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v25,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v26,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v27,%%v19,%%v19,4 \n\t"
|
||||
|
||||
"vst %%v16 , 0(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v17 , 16(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v18 , 32(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v19 , 48(%[t1],%[y_tmp]) \n\t"
|
||||
|
||||
"la %[t1],64(%[t1] ) \n\t"
|
||||
"2: \n\t"
|
||||
"pfd 1, 256(%[t1],%[x_tmp]) \n\t"
|
||||
"pfd 2, 256(%[t1],%[y_tmp]) \n\t"
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
|
||||
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
|
||||
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
|
||||
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
|
||||
|
||||
"vfmadb %%v30, %%v20, %%v28, %%v30 \n\t"
|
||||
"vfmadb %%v31, %%v21, %%v28, %%v31 \n\t"
|
||||
"vfmadb %%v6, %%v22, %%v28, %%v6 \n\t"
|
||||
"vfmadb %%v7, %%v23, %%v28, %%v7 \n\t"
|
||||
"vl %%v16, 64(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v17, 80(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v18, 96(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v19, 112(%[t1],%[y_tmp]) \n\t"
|
||||
"vfmadb %%v30, %%v24, %%v29, %%v30 \n\t"
|
||||
"vfmadb %%v31, %%v25, %%v29, %%v31 \n\t"
|
||||
"vfmadb %%v6, %%v26, %%v29, %%v6 \n\t"
|
||||
"vfmadb %%v7, %%v27, %%v29, %%v7 \n\t"
|
||||
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
|
||||
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
|
||||
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
|
||||
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
|
||||
|
||||
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v28,0(%%r1,%2) \n\t"
|
||||
"vst %%v29,16(%%r1,%2) \n\t"
|
||||
"vst %%v30,32(%%r1,%2) \n\t"
|
||||
"vst %%v31,48(%%r1,%2) \n\t"
|
||||
|
||||
"vst %%v30 , 0(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v31 , 16(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v6 , 32(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v7 , 48(%[t1],%[y_tmp]) \n\t"
|
||||
|
||||
"la %[t1],64(%[t1] ) \n\t"
|
||||
|
||||
"vl %%v16,64(%%r1,%1) \n\t"
|
||||
"vl %%v17,80(%%r1,%1) \n\t"
|
||||
"vl %%v18,96(%%r1,%1) \n\t"
|
||||
"vl %%v19,112(%%r1,%1) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
"vpdi %%v24,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v25,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v26,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v27,%%v19,%%v19,4 \n\t"
|
||||
|
||||
"clgrjl %[t1],%[tmp],1b \n\t"
|
||||
//----------------------------------------------------------------------
|
||||
"vfmadb %%v16, %%v20, %%v28, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v21, %%v28, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v22, %%v28, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v23, %%v28, %%v19 \n\t"
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
|
||||
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
|
||||
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
|
||||
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
|
||||
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
|
||||
|
||||
"vst %%v16 , 0(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v17 , 16(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v18 , 32(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v19 , 48(%[t1],%[y_tmp]) \n\t"
|
||||
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
|
||||
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
|
||||
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
|
||||
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
|
||||
|
||||
: [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1)
|
||||
: [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
|
||||
: "cc", "v6","v7", "v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
"vst %%v28,64(%%r1,%2) \n\t"
|
||||
"vst %%v29,80(%%r1,%2) \n\t"
|
||||
"vst %%v30,96(%%r1,%2) \n\t"
|
||||
"vst %%v31,112(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix = 0, iy = 0;
|
||||
FLOAT da[2];
|
||||
|
||||
if (n <= 0) return (0);
|
||||
|
||||
|
|
@ -166,8 +122,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
|
||||
BLASLONG n1 = n & -8;
|
||||
|
||||
if (n1) {
|
||||
zaxpy_kernel_8(n1, x, y, da_r,da_i);
|
||||
if (n1) {
|
||||
da[0] = da_r;
|
||||
da[1] = da_i;
|
||||
zaxpy_kernel_8(n1, x, y, da);
|
||||
ix = 2 * n1;
|
||||
}
|
||||
i = n1;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -24,71 +24,28 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n)
|
||||
: [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile (
|
||||
"lgr %%r1,%1 \n\t"
|
||||
"lgr %%r2,%2 \n\t"
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1) \n\t"
|
||||
"pfd 2, 1024(%%r2) \n\t"
|
||||
"mvc 0(256,%%r2),0(%%r1) \n\t"
|
||||
"agfi %%r1,256 \n\t"
|
||||
"agfi %%r2,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
|
||||
:"memory","cc","r0","r1","r2"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
|
@ -137,9 +94,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
}
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -23,137 +23,92 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if defined(Z13)
|
||||
|
||||
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
|
||||
|
||||
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"vzero %%v28 \n\t"
|
||||
"vzero %%v29 \n\t"
|
||||
"vzero %%v30 \n\t"
|
||||
"vzero %%v31 \n\t"
|
||||
"srlg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
|
||||
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
|
||||
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
|
||||
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
|
||||
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
|
||||
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"
|
||||
|
||||
"vl %%v16, 64(%%r1,%1) \n\t"
|
||||
"vl %%v17, 80(%%r1,%1) \n\t"
|
||||
"vl %%v18, 96(%%r1,%1) \n\t"
|
||||
"vl %%v19, 112(%%r1,%1) \n\t"
|
||||
"vl %%v0, 64(%%r1,%2) \n\t"
|
||||
"vl %%v1, 80(%%r1,%2) \n\t"
|
||||
"vl %%v2, 96(%%r1,%2) \n\t"
|
||||
"vl %%v3, 112(%%r1,%2) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
|
||||
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
|
||||
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
|
||||
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
|
||||
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
|
||||
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"
|
||||
|
||||
"vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
|
||||
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
|
||||
|
||||
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b \n\t"
|
||||
"vfadb %%v24,%%v26,%%v24 \n\t"
|
||||
"vfadb %%v25,%%v25,%%v27 \n\t"
|
||||
"vsteg %%v24, 0(%[ptr_d]),0 \n\t"
|
||||
"vsteg %%v24, 8(%[ptr_d]),1 \n\t"
|
||||
"vsteg %%v25,16(%[ptr_d]),1 \n\t"
|
||||
"vsteg %%v25,24(%[ptr_d]),0 \n\t"
|
||||
: [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n)
|
||||
: [mem_x] "m"( *(const double (*)[2*n])x),
|
||||
[mem_y] "m"( *(const double (*)[2*n])y),
|
||||
[ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d)
|
||||
: "cc", "r1","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
"vfadb %%v24,%%v24,%%v26 \n\t"
|
||||
"vfadb %%v24,%%v24,%%v28 \n\t"
|
||||
"vfadb %%v24,%%v24,%%v30 \n\t"
|
||||
"vfadb %%v25,%%v25,%%v27 \n\t"
|
||||
"vfadb %%v25,%%v25,%%v29 \n\t"
|
||||
"vfadb %%v25,%%v25,%%v31 \n\t"
|
||||
"vsteg %%v24,0(%3),0 \n\t"
|
||||
"vsteg %%v24,8(%3),1 \n\t"
|
||||
"vsteg %%v25,16(%3),1 \n\t"
|
||||
"vsteg %%v25,24(%3),0 "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
|
||||
BLASLONG register i = 0;
|
||||
FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0};
|
||||
BLASLONG j = 0;
|
||||
|
||||
while (i < n) {
|
||||
|
||||
dot[0] += x[j] * y[j];
|
||||
dot[1] += x[j + 1] * y[j + 1];
|
||||
dot[2] += x[j] * y[j + 1];
|
||||
dot[3] += x[j + 1] * y[j];
|
||||
|
||||
dot[0] += x[j + 2] * y[j + 2];
|
||||
dot[1] += x[j + 3] * y[j + 3];
|
||||
dot[2] += x[j + 2] * y[j + 3];
|
||||
dot[3] += x[j + 3] * y[j + 2];
|
||||
|
||||
dot[0] += x[j + 4] * y[j + 4];
|
||||
dot[1] += x[j + 5] * y[j + 5];
|
||||
dot[2] += x[j + 4] * y[j + 5];
|
||||
dot[3] += x[j + 5] * y[j + 4];
|
||||
|
||||
dot[0] += x[j + 6] * y[j + 6];
|
||||
dot[1] += x[j + 7] * y[j + 7];
|
||||
dot[2] += x[j + 6] * y[j + 7];
|
||||
dot[3] += x[j + 7] * y[j + 6];
|
||||
|
||||
j += 8;
|
||||
i += 4;
|
||||
|
||||
}
|
||||
d[0] = dot[0];
|
||||
d[1] = dot[1];
|
||||
d[2] = dot[2];
|
||||
d[3] = dot[3];
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix=0, iy=0;
|
||||
BLASLONG i;
|
||||
BLASLONG ix, iy;
|
||||
OPENBLAS_COMPLEX_FLOAT result;
|
||||
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
|
||||
|
||||
|
|
@ -167,14 +122,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
if ((inc_x == 1) && (inc_y == 1)) {
|
||||
|
||||
BLASLONG n1 = n & -8;
|
||||
BLASLONG j=0;
|
||||
|
||||
if (n1){
|
||||
if (n1)
|
||||
zdot_kernel_8(n1, x, y, dot);
|
||||
i = n1;
|
||||
j = n1 <<1;
|
||||
}
|
||||
|
||||
|
||||
i = n1;
|
||||
BLASLONG j = i * 2;
|
||||
|
||||
while (i < n) {
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
|
||||
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||
{
|
||||
__asm__ (
|
||||
"pfd 2, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"lgdr %%r1,%[cos] \n\t"
|
||||
"vlvgp %%v0,%%r1,%%r1 \n\t"
|
||||
"lgdr %%r1,%[sin] \n\t"
|
||||
"vlvgp %%v1,%%r1,%%r1 \n\t"
|
||||
"sllg %[tmp],%[tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27,112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19,112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"clgrjl %%r1,%[tmp],1b \n\t"
|
||||
: [mem_x] "+m" (*(double (*)[2*n])x),
|
||||
[mem_y] "+m" (*(double (*)[2*n])y),
|
||||
[tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
|
||||
: "cc","r1" ,"v0","v1","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
__asm__ (
|
||||
"vlrepg %%v0,%3 \n\t"
|
||||
"vlrepg %%v1,%4 \n\t"
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 0(%%r1,%1) \n\t"
|
||||
"vst %%v29, 16(%%r1,%1) \n\t"
|
||||
"vst %%v30, 32(%%r1,%1) \n\t"
|
||||
"vst %%v31, 48(%%r1,%1) \n\t"
|
||||
"vst %%v20, 0(%%r1,%2) \n\t"
|
||||
"vst %%v21, 16(%%r1,%2) \n\t"
|
||||
"vst %%v22, 32(%%r1,%2) \n\t"
|
||||
"vst %%v23, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 64(%%r1,%1) \n\t"
|
||||
"vst %%v29, 80(%%r1,%1) \n\t"
|
||||
"vst %%v30, 96(%%r1,%1) \n\t"
|
||||
"vst %%v31, 112(%%r1,%1) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 128(%%r1,%1) \n\t"
|
||||
"vst %%v29, 144(%%r1,%1) \n\t"
|
||||
"vst %%v30, 160(%%r1,%1) \n\t"
|
||||
"vst %%v31, 176(%%r1,%1) \n\t"
|
||||
"vst %%v20, 128(%%r1,%2) \n\t"
|
||||
"vst %%v21, 144(%%r1,%2) \n\t"
|
||||
"vst %%v22, 160(%%r1,%2) \n\t"
|
||||
"vst %%v23, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%1) \n\t"
|
||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
||||
"vl %%v16, 192(%%r1,%2) \n\t"
|
||||
"vl %%v17, 208(%%r1,%2) \n\t"
|
||||
"vl %%v18, 224(%%r1,%2) \n\t"
|
||||
"vl %%v19, 240(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 192(%%r1,%1) \n\t"
|
||||
"vst %%v29, 208(%%r1,%1) \n\t"
|
||||
"vst %%v30, 224(%%r1,%1) \n\t"
|
||||
"vst %%v31, 240(%%r1,%1) \n\t"
|
||||
"vst %%v20, 192(%%r1,%2) \n\t"
|
||||
"vst %%v21, 208(%%r1,%2) \n\t"
|
||||
"vst %%v22, 224(%%r1,%2) \n\t"
|
||||
"vst %%v23, 240(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
|
|
@ -214,8 +204,11 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
zrot_kernel_16(n1, x, y, c, s);
|
||||
{
|
||||
FLOAT cosa,sina;
|
||||
cosa=c;
|
||||
sina=s;
|
||||
zrot_kernel_16(n1, x, y, &cosa, &sina);
|
||||
i=n1;
|
||||
ix=2*n1;
|
||||
}
|
||||
|
|
@ -234,6 +227,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
@ -259,3 +253,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -23,270 +23,211 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * zscal_kernel_8: scale a contiguous vector of complex doubles in place,
 * x[k] = alpha * x[k], using the z/Architecture vector facility.
 *
 *   n     - element count; the loop runs n >> 3 times and each pass handles
 *           128 bytes (8 complex doubles).
 *   alpha - two doubles: alpha[0] is multiplied into both halves of every
 *           element, alpha[1] is applied (with sign -/+) to the swapped halves,
 *           i.e. alpha[0]/alpha[1] act as the real/imaginary part.
 *   x     - interleaved (re, im) pairs, read and overwritten.
 *
 * Register use inside the asm: r0 = remaining iterations, r1 = byte offset
 * into x, v0 = {alpha[0], alpha[0]}, v1 = {-alpha[1], alpha[1]},
 * v16-v23 = data/results, v24-v31 = data with re/im swapped.
 *
 * NOTE(review): brctg decrements before testing, so a count of zero on entry
 * (n < 8) would not terminate promptly; callers appear to pass a positive
 * multiple of 8 - confirm for any new call site.
 */
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vlrepg %%v0,0(%1) \n\t" /* v0 = {alpha[0], alpha[0]} */
|
||||
"vleg %%v1,8(%1),0 \n\t" /* v1[0] = alpha[1] */
|
||||
"wflcdb %%v1,%%v1 \n\t" /* negate element 0: v1[0] = -alpha[1] */
|
||||
"vleg %%v1,8(%1),1 \n\t" /* v1 = {-alpha[1], alpha[1]} */
|
||||
"srlg %%r0,%0,3 \n\t" /* r0 = n >> 3 (iteration count) */
|
||||
"xgr %%r1,%%r1 \n\t" /* r1 = 0 (byte offset into x) */
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t" /* prefetch x 1024 bytes ahead, store intent */
|
||||
|
||||
"vl %%v16,0(%%r1,%2) \n\t" /* load 8 complex elements: v16..v23 = {re, im} */
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
"vl %%v18,32(%%r1,%2) \n\t"
|
||||
"vl %%v19,48(%%r1,%2) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
"vpdi %%v24,%%v16,%%v16,4 \n\t" /* v24..v31 = {im, re} (halves swapped) */
|
||||
"vpdi %%v25,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v26,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v27,%%v19,%%v19,4 \n\t"
|
||||
"vpdi %%v28,%%v20,%%v20,4 \n\t"
|
||||
"vpdi %%v29,%%v21,%%v21,4 \n\t"
|
||||
"vpdi %%v30,%%v22,%%v22,4 \n\t"
|
||||
"vpdi %%v31,%%v23,%%v23,4 \n\t"
|
||||
|
||||
"vfmdb %%v16,%%v16,%%v0 \n\t" /* {re*alpha[0], im*alpha[0]} */
|
||||
"vfmdb %%v17,%%v17,%%v0 \n\t"
|
||||
"vfmdb %%v18,%%v18,%%v0 \n\t"
|
||||
"vfmdb %%v19,%%v19,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v20,%%v0 \n\t"
|
||||
"vfmdb %%v21,%%v21,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v22,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v23,%%v0 \n\t"
|
||||
"vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" /* += {-im*alpha[1], re*alpha[1]}: complex product */
|
||||
"vfmadb %%v17,%%v25,%%v1,%%v17 \n\t"
|
||||
"vfmadb %%v18,%%v26,%%v1,%%v18 \n\t"
|
||||
"vfmadb %%v19,%%v27,%%v1,%%v19 \n\t"
|
||||
"vfmadb %%v20,%%v28,%%v1,%%v20 \n\t"
|
||||
"vfmadb %%v21,%%v29,%%v1,%%v21 \n\t"
|
||||
"vfmadb %%v22,%%v30,%%v1,%%v22 \n\t"
|
||||
"vfmadb %%v23,%%v31,%%v1,%%v23 \n\t"
|
||||
|
||||
"vst %%v16,0(%%r1,%2) \n\t" /* write the 8 results back over x */
|
||||
"vst %%v17,16(%%r1,%2) \n\t"
|
||||
"vst %%v18,32(%%r1,%2) \n\t"
|
||||
"vst %%v19,48(%%r1,%2) \n\t"
|
||||
"vst %%v20,64(%%r1,%2) \n\t"
|
||||
"vst %%v21,80(%%r1,%2) \n\t"
|
||||
"vst %%v22,96(%%r1,%2) \n\t"
|
||||
"vst %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t" /* advance offset by 128 bytes */
|
||||
"brctg %%r0,0b " /* --r0; loop while non-zero */
|
||||
: /* no outputs: x is updated through the "memory" clobber */
|
||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vleg %%v0,8(%1),0 \n\t"
|
||||
"wflcdb %%v0,%%v0 \n\t"
|
||||
"vleg %%v0,8(%1),1 \n\t"
|
||||
"srlg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) {
|
||||
BLASLONG tempR1 ;
|
||||
__asm__ (
|
||||
"pfd 2, 0(%[x_tmp]) \n\t"
|
||||
#if !defined(CONJ)
|
||||
"lgdr %[t1],%[alpha_r] \n\t"
|
||||
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"lgdr %[t1],%[alpha_i] \n\t"
|
||||
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"vflcdb %%v29,%%v29 \n\t" //complement both
|
||||
"vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i}
|
||||
"vl %%v16,0(%%r1,%2) \n\t"
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
"vl %%v18,32(%%r1,%2) \n\t"
|
||||
"vl %%v19,48(%%r1,%2) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
"vpdi %%v16,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v17,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v18,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v19,%%v19,%%v19,4 \n\t"
|
||||
"vpdi %%v20,%%v20,%%v20,4 \n\t"
|
||||
"vpdi %%v21,%%v21,%%v21,4 \n\t"
|
||||
"vpdi %%v22,%%v22,%%v22,4 \n\t"
|
||||
"vpdi %%v23,%%v23,%%v23,4 \n\t"
|
||||
|
||||
#else
|
||||
"lgdr %[t1],%[alpha_i] \n\t"
|
||||
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"lgdr %[t1],%[alpha_r] \n\t"
|
||||
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"vflcdb %%v28,%%v28 \n\t" //complement both
|
||||
"vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r}
|
||||
#endif
|
||||
|
||||
"xgr %[t1],%[t1] \n\t"
|
||||
"sllg %[tmp],%[tmp],4 \n\t"
|
||||
"vl %%v20 , 0(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21 , 16(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22 , 32(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23 , 48(%[t1],%[x_tmp]) \n\t"
|
||||
|
||||
"lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition
|
||||
"j 2f \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
"vfmdb %%v16, %%v20, %%v28 \n\t"
|
||||
"vfmdb %%v17, %%v21, %%v28 \n\t"
|
||||
"vfmdb %%v18, %%v22, %%v28 \n\t"
|
||||
"vfmdb %%v19, %%v23, %%v28 \n\t"
|
||||
"vl %%v20, 64(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21, 80(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22, 96(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23, 112(%[t1],%[x_tmp]) \n\t"
|
||||
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
|
||||
"vfmdb %%v16,%%v16,%%v0 \n\t"
|
||||
"vfmdb %%v17,%%v17,%%v0 \n\t"
|
||||
"vfmdb %%v18,%%v18,%%v0 \n\t"
|
||||
"vfmdb %%v19,%%v19,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v20,%%v0 \n\t"
|
||||
"vfmdb %%v21,%%v21,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v22,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v23,%%v0 \n\t"
|
||||
|
||||
"vst %%v16,0(%%r1,%2) \n\t"
|
||||
"vst %%v17,16(%%r1,%2) \n\t"
|
||||
"vst %%v18,32(%%r1,%2) \n\t"
|
||||
"vst %%v19,48(%%r1,%2) \n\t"
|
||||
"vst %%v20,64(%%r1,%2) \n\t"
|
||||
"vst %%v21,80(%%r1,%2) \n\t"
|
||||
"vst %%v22,96(%%r1,%2) \n\t"
|
||||
"vst %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
"vst %%v16 , 0(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v17 , 16(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v18 , 32(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v19 , 48(%[t1],%[x_tmp]) \n\t"
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * zscal_kernel_8_zero_i: multiply every double of a contiguous complex
 * vector by alpha[0], in place. Only alpha[0] is read; alpha[1] is ignored,
 * so this is the scaling to use when the imaginary part of alpha is zero.
 *
 *   n     - element count; the loop runs n >> 3 times, 128 bytes
 *           (8 complex doubles) per pass.
 *   alpha - pointer to the scalar; only the first double is loaded.
 *   x     - interleaved (re, im) pairs, read and overwritten.
 *
 * Register use: r0 = remaining iterations, r1 = byte offset into x,
 * v0 = {alpha[0], alpha[0]}, v16-v23 = data.
 *
 * NOTE(review): brctg decrements before testing, so n must be at least 8 on
 * entry - confirm every caller guards this.
 */
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vlrepg %%v0,0(%1) \n\t" /* v0 = {alpha[0], alpha[0]} */
|
||||
"srlg %%r0,%0,3 \n\t" /* r0 = n >> 3 (iteration count) */
|
||||
"xgr %%r1,%%r1 \n\t" /* r1 = 0 (byte offset into x) */
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t" /* prefetch x 1024 bytes ahead, store intent */
|
||||
|
||||
"vl %%v16,0(%%r1,%2) \n\t" /* load 8 complex elements */
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
"vl %%v18,32(%%r1,%2) \n\t"
|
||||
"vl %%v19,48(%%r1,%2) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v16,%%v16,%%v0 \n\t" /* scale both halves by alpha[0] */
|
||||
"vfmdb %%v17,%%v17,%%v0 \n\t"
|
||||
"vfmdb %%v18,%%v18,%%v0 \n\t"
|
||||
"vfmdb %%v19,%%v19,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v20,%%v0 \n\t"
|
||||
"vfmdb %%v21,%%v21,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v22,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v23,%%v0 \n\t"
|
||||
|
||||
"vst %%v16,0(%%r1,%2) \n\t" /* write the results back over x */
|
||||
"vst %%v17,16(%%r1,%2) \n\t"
|
||||
"vst %%v18,32(%%r1,%2) \n\t"
|
||||
"vst %%v19,48(%%r1,%2) \n\t"
|
||||
"vst %%v20,64(%%r1,%2) \n\t"
|
||||
"vst %%v21,80(%%r1,%2) \n\t"
|
||||
"vst %%v22,96(%%r1,%2) \n\t"
|
||||
"vst %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t" /* advance offset by 128 bytes */
|
||||
"brctg %%r0,0b " /* --r0; loop while non-zero */
|
||||
: /* no outputs: x is updated through the "memory" clobber */
|
||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
);
|
||||
}
|
||||
|
||||
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"srlg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
|
||||
"vst %%v24,0(%%r1,%1) \n\t"
|
||||
"vst %%v25,16(%%r1,%1) \n\t"
|
||||
"vst %%v26,32(%%r1,%1) \n\t"
|
||||
"vst %%v27,48(%%r1,%1) \n\t"
|
||||
"vst %%v24,64(%%r1,%1) \n\t"
|
||||
"vst %%v25,80(%%r1,%1) \n\t"
|
||||
"vst %%v26,96(%%r1,%1) \n\t"
|
||||
"vst %%v27,112(%%r1,%1) \n\t"
|
||||
|
||||
"la %[t1],64(%[t1] ) \n\t"
|
||||
"2: \n\t"
|
||||
"pfd 2, 256(%[t1],%[x_tmp]) \n\t"
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
|
||||
"vfmdb %%v30, %%v20, %%v28 \n\t"
|
||||
"vfmdb %%v31, %%v21, %%v28 \n\t"
|
||||
"vfmdb %%v6, %%v22, %%v28 \n\t"
|
||||
"vfmdb %%v7, %%v23, %%v28 \n\t"
|
||||
|
||||
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"
|
||||
|
||||
"vfmadb %%v30, %%v24, %%v29, %%v30 \n\t"
|
||||
"vfmadb %%v31, %%v25, %%v29, %%v31 \n\t"
|
||||
"vfmadb %%v6, %%v26, %%v29, %%v6 \n\t"
|
||||
"vfmadb %%v7, %%v27, %%v29, %%v7 \n\t"
|
||||
|
||||
|
||||
"vst %%v30 , 0(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v31 , 16(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v6 , 32(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v7 , 48(%[t1],%[x_tmp]) \n\t"
|
||||
|
||||
"la %[t1],64(%[t1] ) \n\t"
|
||||
|
||||
|
||||
"clgrjl %[t1],%[tmp],1b \n\t"
|
||||
//----------------------------------------------------------------------
|
||||
"vfmdb %%v16, %%v20, %%v28 \n\t"
|
||||
"vfmdb %%v17, %%v21, %%v28 \n\t"
|
||||
"vfmdb %%v18, %%v22, %%v28 \n\t"
|
||||
"vfmdb %%v19, %%v23, %%v28 \n\t"
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
|
||||
|
||||
"vst %%v16 , 0(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v17 , 16(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v18 , 32(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v19 , 48(%[t1],%[x_tmp]) \n\t"
|
||||
|
||||
: [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1)
|
||||
: [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
|
||||
: "cc", "v6","v7", "v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) {
|
||||
|
||||
__asm__ ( "pfd 2, 0(%1) \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint
|
||||
"vflcdb %%v16,%%v16 \n\t" //complement both
|
||||
"vlvgg %%v16,%%r0,0 \n\t" //restore 1st
|
||||
"vlr %%v17 ,%%v16 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vl %%v24, 0(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v16 \n\t"
|
||||
"vsteg %%v24, 0(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v24, 8(%[x_ptr]),0 \n\t"
|
||||
"vl %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v17 \n\t"
|
||||
"vsteg %%v25, 16(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v25, 24(%[x_ptr]),0 \n\t"
|
||||
"vl %%v26, 32(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v16 \n\t"
|
||||
"vsteg %%v26, 32(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v26, 40(%[x_ptr]),0 \n\t"
|
||||
"vl %%v27, 48(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v17 \n\t"
|
||||
"vsteg %%v27, 48(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v27, 56(%[x_ptr]),0 \n\t"
|
||||
"vl %%v28, 64(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v28,%%v28,%%v16 \n\t"
|
||||
"vsteg %%v28, 64(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v28, 72(%[x_ptr]),0 \n\t"
|
||||
"vl %%v29, 80(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v29,%%v29,%%v17 \n\t"
|
||||
"vsteg %%v29, 80(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v29, 88(%[x_ptr]),0 \n\t"
|
||||
"vl %%v30, 96(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v30,%%v30,%%v16 \n\t"
|
||||
"vsteg %%v30, 96(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v30, 104(%[x_ptr]),0 \n\t"
|
||||
"vl %%v31, 112(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v31,%%v31,%%v17 \n\t"
|
||||
"vsteg %%v31, 112(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v31, 120(%[x_ptr]),0 \n\t"
|
||||
"la %[x_ptr],128(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n),[alpha] "f"(da_i)
|
||||
:"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v24","v25","v26","v27"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * zscal_kernel_8_zero_i (by-value variant): multiply every double of a
 * contiguous complex vector by da_r, in place.
 *
 *   n    - element count; the end address is x + 16*n bytes and each pass
 *          handles 128 bytes (8 complex doubles).
 *   da_r - scale factor, passed in a floating-point register.
 *   x    - interleaved (re, im) pairs, read and overwritten; the local copy
 *          of the pointer is advanced by the asm ("+&a" operand).
 *
 * Register use: r0 first carries the bit pattern of da_r, then the end
 * address; v16-v19 all hold {da_r, da_r}; v24-v31 hold data.
 *
 * NOTE(review): the loop tests at the bottom, so one 128-byte pass always
 * runs; callers presumably pass a positive multiple of 8 - confirm.
 */
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) {
|
||||
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t" /* prefetch start of x, store intent */
|
||||
"lgdr %%r0,%[alpha] \n\t" /* r0 = raw bits of da_r */
|
||||
"vlvgp %%v18,%%r0,%%r0 \n\t" /* v18 = {da_r, da_r} */
|
||||
"vlr %%v19,%%v18 \n\t" /* replicate the multiplier into v19, v16, v17 */
|
||||
"vlr %%v16,%%v18 \n\t"
|
||||
"vlr %%v17,%%v18 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t" /* r0 = n * 16 bytes */
|
||||
"agr %%r0,%[x_ptr] \n\t" /* r0 = end address of the range */
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vl %%v24, 0(%[x_ptr]) \n\t" /* load / scale / store, one element at a time */
|
||||
"vfmdb %%v24,%%v24,%%v18 \n\t"
|
||||
"vst %%v24, 0(%[x_ptr]) \n\t"
|
||||
"vl %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v19 \n\t"
|
||||
"vst %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vl %%v26, 32(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v16 \n\t"
|
||||
"vst %%v26, 32(%[x_ptr]) \n\t"
|
||||
"vl %%v27, 48(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v17 \n\t"
|
||||
"vst %%v27, 48(%[x_ptr]) \n\t"
|
||||
"vl %%v28, 64(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v28,%%v28,%%v18 \n\t"
|
||||
"vst %%v28, 64(%[x_ptr]) \n\t"
|
||||
"vl %%v29, 80(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v29,%%v29,%%v19 \n\t"
|
||||
"vst %%v29, 80(%[x_ptr]) \n\t"
|
||||
"vl %%v30, 96(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v30,%%v30,%%v16 \n\t"
|
||||
"vst %%v30, 96(%[x_ptr]) \n\t"
|
||||
"vl %%v31,112(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v31,%%v31,%%v17 \n\t"
|
||||
"vst %%v31,112(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr],128(%[x_ptr]) \n\t" /* advance pointer by 128 bytes */
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t" /* loop while x_ptr < end (unsigned compare) */
|
||||
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) /* [mem] declares the 2*n doubles as read+written */
|
||||
: [n] "r"(n),[alpha] "f"(da_r)
|
||||
: "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
/*
 * zscal_kernel_8_zero: overwrite a contiguous complex vector with zeros.
 *
 *   n - element count; the end address is x + 16*n bytes and each pass
 *       stores 128 bytes (8 complex doubles).
 *   x - destination; the local copy of the pointer is advanced by the asm.
 *
 * Register use: r0 = end address, v24-v27 = zero vectors (each stored twice
 * per pass).
 *
 * NOTE(review): the loop tests at the bottom, so one 128-byte pass always
 * runs; callers presumably pass a positive multiple of 8 - confirm.
 */
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
|
||||
|
||||
__asm__ ( "pfd 2, 0(%[x_ptr]) \n\t" /* prefetch start of x, store intent */
|
||||
"vzero %%v24 \n\t" /* v24..v27 = 0 */
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t" /* r0 = n * 16 bytes */
|
||||
"agr %%r0,%[x_ptr] \n\t" /* r0 = end address of the range */
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256( %[x_ptr]) \n\t" /* prefetch 256 bytes ahead, store intent */
|
||||
"vst %%v24, 0( %[x_ptr]) \n\t" /* eight 16-byte zero stores = 128 bytes */
|
||||
"vst %%v25, 16( %[x_ptr]) \n\t"
|
||||
"vst %%v26, 32( %[x_ptr]) \n\t"
|
||||
"vst %%v27, 48( %[x_ptr]) \n\t"
|
||||
"vst %%v24, 64( %[x_ptr]) \n\t"
|
||||
"vst %%v25, 80( %[x_ptr]) \n\t"
|
||||
"vst %%v26, 96( %[x_ptr]) \n\t"
|
||||
"vst %%v27,112( %[x_ptr]) \n\t"
|
||||
|
||||
"la %[x_ptr],128(%[x_ptr]) \n\t" /* advance pointer by 128 bytes */
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t" /* loop while x_ptr < end (unsigned compare) */
|
||||
: [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x) /* [mem] declares the 2*n doubles as written */
|
||||
: [n] "r"(n)
|
||||
:"cc" ,"r0","v24","v25","v26","v27"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) {
|
||||
|
||||
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i;
|
||||
BLASLONG inc_x2 = 2 * inc_x;
|
||||
BLASLONG inc_x3 = inc_x2 + inc_x;
|
||||
FLOAT t0, t1, t2, t3;
|
||||
FLOAT t0, t1, t2, t3;
|
||||
FLOAT da_r = alpha[0];
|
||||
FLOAT da_i = alpha[1];
|
||||
|
||||
for (i = 0; i < n; i += 4) {
|
||||
for (i = 0; i < n; i += 4)
|
||||
{
|
||||
t0 = da_r * x[0] - da_i * x[1];
|
||||
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
|
||||
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
|
||||
|
|
@ -303,17 +244,14 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLAS
|
|||
x[inc_x3] = t3;
|
||||
|
||||
x += 4 * inc_x;
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
|
||||
BLASLONG i = 0, j = 0;
|
||||
FLOAT temp0;
|
||||
FLOAT temp1;
|
||||
|
||||
FLOAT alpha[2] __attribute__ ((aligned(16)));
|
||||
|
||||
if (inc_x != 1) {
|
||||
inc_x <<= 1;
|
||||
|
|
@ -405,8 +343,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
} else {
|
||||
|
||||
BLASLONG n1 = n & -8;
|
||||
if (n1 > 0) {
|
||||
zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x);
|
||||
if (n1 > 0) {
|
||||
alpha[0] = da_r;
|
||||
alpha[1] = da_i;
|
||||
zscal_kernel_inc_8(n1, alpha, x, inc_x);
|
||||
j = n1;
|
||||
i = n1 * inc_x;
|
||||
}
|
||||
|
|
@ -432,17 +372,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
BLASLONG n1 = n & -8;
|
||||
if (n1 > 0) {
|
||||
|
||||
alpha[0] = da_r;
|
||||
alpha[1] = da_i;
|
||||
|
||||
if (da_r == 0.0)
|
||||
if (da_i == 0)
|
||||
zscal_kernel_8_zero(n1, x);
|
||||
else
|
||||
zscal_kernel_8_zero_r(n1, da_i, x);
|
||||
zscal_kernel_8_zero_r(n1, alpha, x);
|
||||
else
|
||||
if (da_i == 0)
|
||||
zscal_kernel_8_zero_i(n1, da_r, x);
|
||||
zscal_kernel_8_zero_i(n1, alpha, x);
|
||||
else
|
||||
zscal_kernel_8(n1, da_r,da_i, x);
|
||||
zscal_kernel_8(n1, alpha, x);
|
||||
|
||||
i = n1 << 1;
|
||||
j = n1;
|
||||
|
|
@ -508,5 +450,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
|
||||
return (0);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,220 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(Z13_SWAP_A)
|
||||
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"
|
||||
__asm__ volatile(
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_x] "+m" (*(double (*)[2*n])x),
|
||||
[mem_y] "+m" (*(double (*)[2*n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
,"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
"vl %%v0, 128(%%r1,%2) \n\t"
|
||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
||||
|
||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * zswap_kernel_16: exchange the contents of two contiguous complex-double
 * vectors, 256 bytes (16 complex elements) of each per loop iteration.
 *
 *   n - element count; the asm shifts it right by 4 to get the iteration
 *       count and counts it down to zero (operand is "+&r", so only the
 *       local copy changes).
 *   x - first vector, read then overwritten with y's data.
 *   y - second vector, read then overwritten with x's data.
 *
 * Per iteration: all 256 bytes of x are held in v16-v31, y is copied into x
 * in two 128-byte halves through v0-v7, then v16-v31 are stored into y.
 * r1 is the running byte offset shared by both vectors.
 *
 * NOTE(review): brctg decrements before testing, so n must be at least 16
 * on entry - confirm every caller guards this.
 */
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 2, 0(%[ptr_x]) \n\t" /* prefetch the start of both vectors, store intent */
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],4 \n\t" /* iteration count = n >> 4 */
|
||||
"xgr %%r1,%%r1 \n\t" /* r1 = 0 (byte offset) */
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t" /* prefetch the next 256-byte chunk of x and y */
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v16, 0(%%r1,%[ptr_x]) \n\t" /* v16..v31 = 256 bytes of x */
|
||||
"vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
|
||||
"vl %%v0, 0(%%r1,%[ptr_y]) \n\t" /* first 128 bytes of y -> v0..v7 -> x */
|
||||
"vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v7, 112(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v0, 128(%%r1,%[ptr_y]) \n\t" /* second 128 bytes of y -> v0..v7 -> x */
|
||||
"vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v7, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vst %%v16, 0(%%r1,%[ptr_y]) \n\t" /* saved x data (v16..v31) -> y */
|
||||
"vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
"la %%r1,256(%%r1) \n\t" /* advance offset by 256 bytes */
|
||||
"brctg %[n_tmp],1b" /* --count; loop while non-zero */
|
||||
: [mem_x] "+m" (*(double (*)[2*n])x), /* both arrays are declared read+written */
|
||||
[mem_y] "+m" (*(double (*)[2*n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,437 @@
|
|||
# Builds the stand-alone benchmark / self-check programs (*.goto).
# Each program is a single object compiled from one driver source with the
# precision selected through -DDOUBLE / -DCOMPLEX, linked against the library
# in the parent directory.
TOPDIR	= ..
include $(TOPDIR)/Makefile.system

# Every program built by the default "goto" target.
BENCHMARKS = \
  sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto \
  sswap.goto dswap.goto cswap.goto zswap.goto \
  isamax.goto idamax.goto icamax.goto izamax.goto \
  samax.goto damax.goto \
  ismax.goto idmax.goto \
  smax.goto dmax.goto \
  isamin.goto idamin.goto icamin.goto izamin.goto \
  samin.goto damin.goto camin.goto zamin.goto \
  ismin.goto idmin.goto \
  smin.goto dmin.goto \
  sgemv.goto dgemv.goto cgemv.goto zgemv.goto \
  sscal.goto dscal.goto cscal.goto zscal.goto \
  saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
  srot.goto drot.goto crot.goto zrot.goto \
  sasum.goto dasum.goto casum.goto zasum.goto \
  scopy.goto dcopy.goto ccopy.goto zcopy.goto

goto :: $(BENCHMARKS)

##################################### Link rules ###############################################
# All programs link the same way, so one static pattern rule replaces the
# per-target copies (the axpy targets used to be defined twice, which made
# make warn about overridden recipes).
$(BENCHMARKS) : %.goto : %.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Object rules #############################################

##################################### dot ######################################################
sdot.$(SUFFIX) : dot.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

ddot.$(SUFFIX) : dot.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

cdot.$(SUFFIX) : dot.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zdot.$(SUFFIX) : dot.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

dsdot.$(SUFFIX) : dsdot.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

##################################### iamax ####################################################
isamax.$(SUFFIX) : iamax.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

idamax.$(SUFFIX) : iamax.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

icamax.$(SUFFIX) : iamax.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

izamax.$(SUFFIX) : iamax.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

##################################### amax #####################################################
samax.$(SUFFIX) : amax.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

damax.$(SUFFIX) : amax.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

##################################### imax #####################################################
ismax.$(SUFFIX) : imax.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

idmax.$(SUFFIX) : imax.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

##################################### max ######################################################
smax.$(SUFFIX) : max.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dmax.$(SUFFIX) : max.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

##################################### iamin ####################################################
isamin.$(SUFFIX) : iamin.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

idamin.$(SUFFIX) : iamin.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

icamin.$(SUFFIX) : iamin.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

izamin.$(SUFFIX) : iamin.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

##################################### amin #####################################################
samin.$(SUFFIX) : amin.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

damin.$(SUFFIX) : amin.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

camin.$(SUFFIX) : amin.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zamin.$(SUFFIX) : amin.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

##################################### imin #####################################################
ismin.$(SUFFIX) : imin.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

idmin.$(SUFFIX) : imin.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

##################################### min ######################################################
smin.$(SUFFIX) : min.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dmin.$(SUFFIX) : min.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

##################################### gemv #####################################################
sgemv.$(SUFFIX) : gemv.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dgemv.$(SUFFIX) : gemv.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

cgemv.$(SUFFIX) : gemv.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zgemv.$(SUFFIX) : gemv.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

##################################### scal #####################################################
sscal.$(SUFFIX) : scal.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dscal.$(SUFFIX) : scal.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

cscal.$(SUFFIX) : scal.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zscal.$(SUFFIX) : scal.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

##################################### axpy #####################################################
saxpy.$(SUFFIX) : axpy.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

daxpy.$(SUFFIX) : axpy.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

caxpy.$(SUFFIX) : axpy.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zaxpy.$(SUFFIX) : axpy.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

##################################### rot ######################################################
srot.$(SUFFIX) : rot.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

drot.$(SUFFIX) : rot.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

crot.$(SUFFIX) : rot.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zrot.$(SUFFIX) : rot.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

##################################### swap #####################################################
sswap.$(SUFFIX) : swap.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dswap.$(SUFFIX) : swap.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

cswap.$(SUFFIX) : swap.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zswap.$(SUFFIX) : swap.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

##################################### asum #####################################################
sasum.$(SUFFIX) : asum.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dasum.$(SUFFIX) : asum.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

casum.$(SUFFIX) : asum.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zasum.$(SUFFIX) : asum.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

##################################### copy #####################################################
scopy.$(SUFFIX) : copy.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dcopy.$(SUFFIX) : copy.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

ccopy.$(SUFFIX) : copy.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zcopy.$(SUFFIX) : copy.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

###################################################################################################

clean ::
	@rm -f *.goto
|
||||
|
||||
|
|
@ -0,0 +1,235 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Compare two doubles using an absolute tolerance.
 *
 *   exp  - expected value
 *   real - value actually produced
 *   tol  - largest acceptable absolute difference
 *
 * Returns 1 when the values are equal or |exp - real| <= tol, 0 otherwise.
 * The check is phrased as "within tolerance" rather than "exceeds tolerance"
 * so that a NaN on either side, which compares false against everything,
 * is reported as a mismatch instead of silently passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* identical values (including matching infinities) always match */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  return (absdiff <= tol) ? 1 : 0;
}
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0;
|
||||
FLOAT maxf=0.0;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(maxf);
|
||||
|
||||
maxf=ABS(x[0]);
|
||||
ix += inc_x;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( ABS(x[ix]) > maxf )
|
||||
{
|
||||
maxf = ABS(x[ix]);
|
||||
}
|
||||
ix += inc_x;
|
||||
i++;
|
||||
}
|
||||
return(maxf);
|
||||
}
|
||||
|
||||
/* Bind AMAX to the library entry point for the precision being built. */
#undef AMAX
#if defined(DOUBLE)
#define AMAX BLASFUNC(damax)
#else
#define AMAX BLASFUNC(samax)
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocate "size" bytes from a System V shared-memory segment requested
 * with SHM_HUGETLB. The request is padded to a HUGE_PAGESIZE multiple.
 * Prints a message and exits with status 1 if shmget or shmat fails;
 * otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  void *address;
  int shmid = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);

  if (shmid < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  address = shmat(shmid, NULL, SHM_RND);
  if ((BLASLONG)address == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* the id is no longer needed once the segment is attached */
  shmctl(shmid, IPC_RMID, 0);

  return address;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
FLOAT result, result_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result = AMAX (&m, x, &inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result_c = amax_c(m, x, inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
test &= assert_dbl_near(result, result_c, SINGLE_EPS);
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,235 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Compare two doubles using an absolute tolerance.
 *
 *   exp  - expected value
 *   real - value actually produced
 *   tol  - largest acceptable absolute difference
 *
 * Returns 1 when the values are equal or |exp - real| <= tol, 0 otherwise.
 * The check is phrased as "within tolerance" rather than "exceeds tolerance"
 * so that a NaN on either side, which compares false against everything,
 * is reported as a mismatch instead of silently passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* identical values (including matching infinities) always match */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  return (absdiff <= tol) ? 1 : 0;
}
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0;
|
||||
FLOAT minf=0.0;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(minf);
|
||||
|
||||
minf=ABS(x[0]);
|
||||
ix += inc_x;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( ABS(x[ix]) < minf )
|
||||
{
|
||||
minf = ABS(x[ix]);
|
||||
}
|
||||
ix += inc_x;
|
||||
i++;
|
||||
}
|
||||
return(minf);
|
||||
}
|
||||
|
||||
/* Bind AMIN to the library entry point for the precision being built. */
#undef AMIN
#if defined(DOUBLE)
#define AMIN BLASFUNC(damin)
#else
#define AMIN BLASFUNC(samin)
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocate "size" bytes from a System V shared-memory segment requested
 * with SHM_HUGETLB. The request is padded to a HUGE_PAGESIZE multiple.
 * Prints a message and exits with status 1 if shmget or shmat fails;
 * otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  void *address;
  int shmid = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);

  if (shmid < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  address = shmat(shmid, NULL, SHM_RND);
  if ((BLASLONG)address == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* the id is no longer needed once the segment is attached */
  shmctl(shmid, IPC_RMID, 0);

  return address;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
FLOAT result, result_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result = AMIN (&m, x, &inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result_c = amin_c(m, x, inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
test &= assert_dbl_near(result, result_c, SINGLE_EPS);
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,263 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Returns 1 when |exp - real| <= tol, otherwise 0.
 *
 * A NaN difference (either input is NaN, or inf - inf) is treated as
 * "not near", so a kernel producing NaN cannot be reported as passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double diff = exp - real;
    /* avoid using fabs and linking with a math lib */
    double absdiff = (diff < 0) ? -diff : diff;

    /* Written as !(a <= b) rather than (a > b): every ordered comparison
       involving NaN is false, so this form rejects NaN instead of accepting it. */
    if (!(absdiff <= tol)) {
        return 0;
    }
    return 1;
}
|
||||
|
||||
/* ABS is fabs when DOUBLE is defined, otherwise the single-precision fabsf. */
#ifdef DOUBLE
#define ABS fabs
#else
#define ABS fabsf
#endif
|
||||
#ifdef COMPLEX
|
||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
||||
FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
FLOAT sumf = 0.0;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
n *= inc_x2;
|
||||
while(i < n)
|
||||
{
|
||||
sumf += CABS1(x,i);
|
||||
i += inc_x2;
|
||||
}
|
||||
return(sumf);
|
||||
}
|
||||
#else
|
||||
FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
FLOAT sumf = 0.0;
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
n *= inc_x;
|
||||
while(i < n)
|
||||
{
|
||||
sumf += ABS(x[i]);
|
||||
i += inc_x;
|
||||
}
|
||||
return(sumf);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Bind ASUM to the library routine for the configured type and precision. */
#undef ASUM
#if defined(COMPLEX) && defined(DOUBLE)
#define ASUM BLASFUNC(dzasum)
#elif defined(COMPLEX)
#define ASUM BLASFUNC(scasum)
#elif defined(DOUBLE)
#define ASUM BLASFUNC(dasum)
#else
#define ASUM BLASFUNC(sasum)
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

/*
 * Minimal gettimeofday() replacement for Windows builds.
 *
 * When tv is non-NULL it is filled from GetSystemTimeAsFileTime(), converted
 * to microseconds and shifted by DELTA_EPOCH_IN_MICROSECS. The tz argument is
 * ignored. Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz;  /* unused */

  if (NULL != tv)
    {
      GetSystemTimeAsFileTime(&ft);

      /* assemble the 64-bit value from its high and low 32-bit halves */
      tmpres |= ft.dwHighDateTime;
      tmpres <<= 32;
      tmpres |= ft.dwLowDateTime;

      /*converting file time to unix epoch*/
      tmpres /= 10;  /*convert into microseconds*/
      tmpres -= DELTA_EPOCH_IN_MICROSECS;
      tv->tv_sec = (long)(tmpres / 1000000UL);
      tv->tv_usec = (long)(tmpres % 1000000UL);
    }

  return 0;
}

#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Compiled out by the trailing "&& 0" in the guard above.
 * When enabled, allocates a SysV shared-memory segment with SHM_HUGETLB and
 * substitutes it for malloc in the rest of the file. Prints a message and
 * exits on failure.
 */
static void *huge_malloc(BLASLONG size){
  int segment;
  void *mapped;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);

  return mapped;
}

#define malloc huge_malloc

#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
FLOAT result, result_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result = ASUM (&m, x, &inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#ifdef COMPLEX
|
||||
result_c = zasum_c(m, x, inc_x);
|
||||
#else
|
||||
result_c = asum_c(m, x, inc_x);
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
test &= assert_dbl_near(result, result_c, SINGLE_EPS);
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 4. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#else
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,303 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Returns 1 when |exp - real| <= tol, otherwise 0.
 *
 * A NaN difference (either input is NaN, or inf - inf) is treated as
 * "not near", so a kernel producing NaN cannot be reported as passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double diff = exp - real;
    /* avoid using fabs and linking with a math lib */
    double absdiff = (diff < 0) ? -diff : diff;

    /* Written as !(a <= b) rather than (a > b): every ordered comparison
       involving NaN is false, so this form rejects NaN instead of accepting it. */
    if (!(absdiff <= tol)) {
        return 0;
    }
    return 1;
}
|
||||
|
||||
#ifdef COMPLEX
|
||||
int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix,iy;
|
||||
BLASLONG inc_x2;
|
||||
BLASLONG inc_y2;
|
||||
|
||||
if ( n < 0 ) return(0);
|
||||
if ( da_r == 0.0 && da_i == 0.0 ) return(0);
|
||||
|
||||
ix = 0;
|
||||
iy = 0;
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
inc_y2 = 2 * inc_y;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
#if !defined(CONJ)
|
||||
y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
|
||||
y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
|
||||
#else
|
||||
y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
|
||||
y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
|
||||
#endif
|
||||
ix += inc_x2 ;
|
||||
iy += inc_y2 ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
}
|
||||
#else
|
||||
int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix,iy;
|
||||
|
||||
if ( n < 0 ) return(0);
|
||||
if ( da == 0.0 ) return(0);
|
||||
|
||||
ix = 0;
|
||||
iy = 0;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
y[iy] += da * x[ix] ;
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Bind AXPY to the library routine for the configured type and precision. */
#undef AXPY
#if defined(COMPLEX) && defined(DOUBLE)
#define AXPY BLASFUNC(zaxpy)
#elif defined(COMPLEX)
#define AXPY BLASFUNC(caxpy)
#elif defined(DOUBLE)
#define AXPY BLASFUNC(daxpy)
#else
#define AXPY BLASFUNC(saxpy)
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

/*
 * Minimal gettimeofday() replacement for Windows builds.
 *
 * When tv is non-NULL it is filled from GetSystemTimeAsFileTime(), converted
 * to microseconds and shifted by DELTA_EPOCH_IN_MICROSECS. The tz argument is
 * ignored. Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz;  /* unused */

  if (NULL != tv)
    {
      GetSystemTimeAsFileTime(&ft);

      /* assemble the 64-bit value from its high and low 32-bit halves */
      tmpres |= ft.dwHighDateTime;
      tmpres <<= 32;
      tmpres |= ft.dwLowDateTime;

      /*converting file time to unix epoch*/
      tmpres /= 10;  /*convert into microseconds*/
      tmpres -= DELTA_EPOCH_IN_MICROSECS;
      tv->tv_sec = (long)(tmpres / 1000000UL);
      tv->tv_usec = (long)(tmpres % 1000000UL);
    }

  return 0;
}

#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Compiled out by the trailing "&& 0" in the guard above.
 * When enabled, allocates a SysV shared-memory segment with SHM_HUGETLB and
 * substitutes it for malloc in the rest of the file. Prints a message and
 * exits on failure.
 */
static void *huge_malloc(BLASLONG size){
  int segment;
  void *mapped;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);

  return mapped;
}

#define malloc huge_malloc

#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y, *y_c;;
|
||||
FLOAT alpha[2] = { 2.0, 2.0 };
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
blasint iy;
|
||||
int test = 1;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
y_c[i] = y[i];
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
AXPY (&m, alpha, x, &inc_x, y, &inc_y );
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#ifdef COMPLEX
|
||||
zaxpy_c(m, 0, 0, alpha[0], alpha[1], x, inc_x, y_c, inc_y, NULL, 0);
|
||||
#else
|
||||
axpy_c(m, 0, 0, *alpha, x, inc_x, y_c, inc_y, NULL, 0);
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
iy = 0;
|
||||
#ifdef COMPLEX
|
||||
for (i = 0; i < m * 2; i++)
|
||||
#else
|
||||
for (i = 0; i < m; i++)
|
||||
#endif
|
||||
{
|
||||
test &= assert_dbl_near(y[iy], y_c[iy], SINGLE_EPS);
|
||||
iy += inc_y;
|
||||
}
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#else
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,291 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Returns 1 when |exp - real| <= tol, otherwise 0.
 *
 * A NaN difference (either input is NaN, or inf - inf) is treated as
 * "not near", so a kernel producing NaN cannot be reported as passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double diff = exp - real;
    /* avoid using fabs and linking with a math lib */
    double absdiff = (diff < 0) ? -diff : diff;

    /* Written as !(a <= b) rather than (a > b): every ordered comparison
       involving NaN is false, so this form rejects NaN instead of accepting it. */
    if (!(absdiff <= tol)) {
        return 0;
    }
    return 1;
}
|
||||
|
||||
#ifdef COMPLEX
|
||||
int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
BLASLONG inc_x2;
|
||||
BLASLONG inc_y2;
|
||||
|
||||
if ( n < 0 ) return(0);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
inc_y2 = 2 * inc_y;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
y[iy] = x[ix] ;
|
||||
y[iy+1] = x[ix+1] ;
|
||||
ix += inc_x2;
|
||||
iy += inc_y2;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
}
|
||||
#else
|
||||
int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
|
||||
if ( n < 0 ) return(0);
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
y[iy] = x[ix] ;
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Bind COPY to the library routine for the configured type and precision. */
#undef COPY
#if defined(COMPLEX) && defined(DOUBLE)
#define COPY BLASFUNC(zcopy)
#elif defined(COMPLEX)
#define COPY BLASFUNC(ccopy)
#elif defined(DOUBLE)
#define COPY BLASFUNC(dcopy)
#else
#define COPY BLASFUNC(scopy)
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

/*
 * Minimal gettimeofday() replacement for Windows builds.
 *
 * When tv is non-NULL it is filled from GetSystemTimeAsFileTime(), converted
 * to microseconds and shifted by DELTA_EPOCH_IN_MICROSECS. The tz argument is
 * ignored. Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz;  /* unused */

  if (NULL != tv)
    {
      GetSystemTimeAsFileTime(&ft);

      /* assemble the 64-bit value from its high and low 32-bit halves */
      tmpres |= ft.dwHighDateTime;
      tmpres <<= 32;
      tmpres |= ft.dwLowDateTime;

      /*converting file time to unix epoch*/
      tmpres /= 10;  /*convert into microseconds*/
      tmpres -= DELTA_EPOCH_IN_MICROSECS;
      tv->tv_sec = (long)(tmpres / 1000000UL);
      tv->tv_usec = (long)(tmpres % 1000000UL);
    }

  return 0;
}

#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Compiled out by the trailing "&& 0" in the guard above.
 * When enabled, allocates a SysV shared-memory segment with SHM_HUGETLB and
 * substitutes it for malloc in the rest of the file. Prints a message and
 * exits on failure.
 */
static void *huge_malloc(BLASLONG size){
  int segment;
  void *mapped;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);

  return mapped;
}

#define malloc huge_malloc

#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y, *y_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
blasint iy;
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
y_c[i] = y[i];
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
COPY (&m, x, &inc_x, y, &inc_y );
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#ifdef COMPLEX
|
||||
zcopy_c(m, x, inc_x, y_c, inc_y);
|
||||
#else
|
||||
copy_c(m, x, inc_x, y_c, inc_y);
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
iy = 0;
|
||||
#ifdef COMPLEX
|
||||
for (i = 0; i < m * 2; i++)
|
||||
#else
|
||||
for (i = 0; i < m; i++)
|
||||
#endif
|
||||
{
|
||||
test &= assert_dbl_near(y[iy], y_c[iy], SINGLE_EPS);
|
||||
iy += inc_y;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#else
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,296 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Returns 1 when |exp - real| <= tol, otherwise 0.
 *
 * A NaN difference (either input is NaN, or inf - inf) is treated as
 * "not near", so a kernel producing NaN cannot be reported as passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double diff = exp - real;
    /* avoid using fabs and linking with a math lib */
    double absdiff = (diff < 0) ? -diff : diff;

    /* Written as !(a <= b) rather than (a > b): every ordered comparison
       involving NaN is false, so this form rejects NaN instead of accepting it. */
    if (!(absdiff <= tol)) {
        return 0;
    }
    return 1;
}
|
||||
|
||||
#ifdef COMPLEX
|
||||
/*
 * Plain C reference for the complex dot product.
 *
 * x and y hold interleaved (real, imag) pairs; inc_x / inc_y are strides in
 * complex elements.  Without CONJ the result is sum(x[k] * y[k]); with CONJ
 * it is sum(conj(x[k]) * y[k]).  Returns 0 + 0i when n < 1.
 */
OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	OPENBLAS_COMPLEX_FLOAT result;
	FLOAT acc_re = 0.0;
	FLOAT acc_im = 0.0;
	BLASLONG px = 0;
	BLASLONG py = 0;
	BLASLONG step_x;
	BLASLONG step_y;
	BLASLONG k;

	CREAL(result) = 0.0 ;
	CIMAG(result) = 0.0 ;

	if ( n < 1 ) return(result);

	/* Each complex element occupies two FLOAT slots. */
	step_x = 2 * inc_x ;
	step_y = 2 * inc_y ;

	for (k = 0; k < n; k++)
	{
#if !defined(CONJ)
		acc_re += ( x[px]   * y[py] - x[px+1] * y[py+1] ) ;
		acc_im += ( x[px+1] * y[py] + x[px]   * y[py+1] ) ;
#else
		acc_re += ( x[px]   * y[py] + x[px+1] * y[py+1] ) ;
		acc_im -= ( x[px+1] * y[py] - x[px]   * y[py+1] ) ;
#endif
		px += step_x ;
		py += step_y ;
	}

	CREAL(result) = acc_re;
	CIMAG(result) = acc_im;
	return(result);
}
|
||||
#else
|
||||
/*
 * Plain C reference for the real dot product: returns the sum of
 * y[k*inc_y] * x[k*inc_x] for k = 0 .. n-1, accumulated in FLOAT.
 * Returns 0 when n <= 0 (the loop body never runs).
 */
FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	FLOAT sum = 0.0 ;
	BLASLONG px = 0;
	BLASLONG py = 0;
	BLASLONG k;

	for (k = 0; k < n; k++)
	{
		sum += y[py] * x[px] ;
		px  += inc_x ;
		py  += inc_y ;
	}

	return(sum);
}
|
||||
#endif
|
||||
|
||||
#undef DOT
|
||||
#ifdef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define DOT BLASFUNC(zdotu)
|
||||
#else
|
||||
#define DOT BLASFUNC(cdotu)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define DOT BLASFUNC(ddot)
|
||||
#else
|
||||
#define DOT BLASFUNC(sdot)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocate `size` bytes from a SHM_HUGETLB System V shared memory segment.
 *
 * The requested size is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  The segment is marked for removal right after it is
 * attached.  On any failure a message is printed and the process exits
 * with status 1.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

	void *address;
	int shmid = shmget(IPC_PRIVATE,
			   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
			   SHM_HUGETLB | IPC_CREAT |0600);

	if (shmid < 0) {
		printf( "Memory allocation failed(shmget).\n");
		exit(1);
	}

	address = shmat(shmid, NULL, SHM_RND);

	if ((BLASLONG)address == -1){
		printf( "Memory allocation failed(shmat).\n");
		exit(1);
	}

	/* Mark the segment for removal now that it is attached. */
	shmctl(shmid, IPC_RMID, 0);

	return address;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y;
|
||||
#ifdef COMPLEX
|
||||
OPENBLAS_COMPLEX_FLOAT result, result_c;
|
||||
#else
|
||||
FLOAT result, result_c;
|
||||
#endif
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result = DOT(&m, x, &inc_x, y, &inc_y);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#ifdef COMPLEX
|
||||
result_c = zdot_c(m, x, inc_x, y, inc_y);
|
||||
#else
|
||||
result_c = dot_c(m, x, inc_x, y, inc_y);
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
#ifdef COMPLEX
|
||||
test &= assert_dbl_near(CREAL(result), CREAL(result_c), SINGLE_EPS);
|
||||
test &= assert_dbl_near(CIMAG(result), CIMAG(result_c), SINGLE_EPS);
|
||||
#else
|
||||
test &= assert_dbl_near(result, result_c, SINGLE_EPS);
|
||||
#endif
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,229 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Approximate comparison of two doubles.
 *
 * Returns 1 when exp and real compare equal or when |exp - real| <= tol,
 * and 0 otherwise.  If the difference is NaN (for example because one of
 * the inputs is NaN) the result is 0, so a NaN result can never be reported
 * as matching the reference.
 */
int assert_dbl_near(double exp, double real, double tol) {
	double diff;
	double absdiff;

	/* Exactly equal values (including equal infinities) always match. */
	if (exp == real) {
		return 1;
	}

	diff = exp - real;
	/* avoid using fabs and linking with a math lib */
	absdiff = (diff < 0) ? -diff : diff;

	/* Written as !(a <= b) instead of (a > b) so that NaN is rejected. */
	if (!(absdiff <= tol)) {
		return 0;
	}
	return 1;
}
|
||||
|
||||
/*
 * Plain C reference for DSDOT: dot product of two FLOAT vectors with the
 * products formed and accumulated in double precision.
 *
 * Returns the sum of y[k*inc_y] * x[k*inc_x] for k = 0 .. n-1, or 0.0 when
 * n <= 0.
 */
double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;
	double dot = 0.0 ;

	if ( n < 0 ) return(dot);

	while(i < n)
	{
		/* Widen both operands before multiplying; otherwise the product
		   is rounded to FLOAT precision before it reaches the double
		   accumulator. */
		dot += (double) y[iy] * (double) x[ix] ;
		ix  += inc_x ;
		iy  += inc_y ;
		i++ ;
	}

	return(dot);
}
|
||||
|
||||
#undef DSDOT
|
||||
#define DSDOT BLASFUNC(dsdot)
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocate `size` bytes from a SHM_HUGETLB System V shared memory segment.
 *
 * The requested size is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  The segment is marked for removal right after it is
 * attached.  On any failure a message is printed and the process exits
 * with status 1.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

	void *address;
	int shmid = shmget(IPC_PRIVATE,
			   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
			   SHM_HUGETLB | IPC_CREAT |0600);

	if (shmid < 0) {
		printf( "Memory allocation failed(shmget).\n");
		exit(1);
	}

	address = shmat(shmid, NULL, SHM_RND);

	if ((BLASLONG)address == -1){
		printf( "Memory allocation failed(shmat).\n");
		exit(1);
	}

	/* Mark the segment for removal now that it is attached. */
	shmctl(shmid, IPC_RMID, 0);

	return address;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y;
|
||||
double result, result_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result = DSDOT(&m, x, &inc_x, y, &inc_y);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result_c = dsdot_c(m, x, inc_x, y, inc_y);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
test &= assert_dbl_near(result, result_c, SINGLE_EPS);
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,618 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Approximate comparison of two doubles.
 *
 * Returns 1 when exp and real compare equal or when |exp - real| <= tol,
 * and 0 otherwise.  If the difference is NaN (for example because one of
 * the inputs is NaN) the result is 0, so a NaN result can never be reported
 * as matching the reference.
 */
int assert_dbl_near(double exp, double real, double tol) {
	double diff;
	double absdiff;

	/* Exactly equal values (including equal infinities) always match. */
	if (exp == real) {
		return 1;
	}

	diff = exp - real;
	/* avoid using fabs and linking with a math lib */
	absdiff = (diff < 0) ? -diff : diff;

	/* Written as !(a <= b) instead of (a > b) so that NaN is rejected. */
	if (!(absdiff <= tol)) {
		return 0;
	}
	return 1;
}
|
||||
|
||||
#ifdef COMPLEX
|
||||
/*
 * Plain C reference for the complex non-transposed matrix-vector update
 * y += alpha * A * x.
 *
 * a holds interleaved (real, imag) pairs with consecutive columns 2*lda
 * FLOATs apart; it is read as m rows by n columns.  x is read n times and
 * y is updated m times per column; inc_x / inc_y are strides in complex
 * elements.  CONJ / XCONJ select the conjugated variants at compile time.
 * dummy1 is unused.  Always returns 0.
 *
 * The unit-stride case needs no separate path: with inc_x == inc_y == 1 the
 * strides below are simply 2.
 */
int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i;
	BLASLONG ix,iy;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT temp_r,temp_i;
	BLASLONG inc_x2,inc_y2;
	BLASLONG lda2;
	BLASLONG i2;

	(void)dummy1;

	lda2   = 2 * lda;
	inc_x2 = 2 * inc_x;
	inc_y2 = 2 * inc_y;

	ix = 0;
	a_ptr = a;

	for (j=0; j<n; j++)
	{
		/* temp = alpha * x[j] (or alpha * conj(x[j]) with XCONJ) */
#if !defined(XCONJ)
		temp_r = alpha_r * x[ix]   - alpha_i * x[ix+1];
		temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
#else
		temp_r = alpha_r * x[ix]   + alpha_i * x[ix+1];
		temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
#endif
		iy = 0;
		i2 = 0;

		for (i=0; i<m; i++)
		{
#if !defined(CONJ)

#if !defined(XCONJ)
			y[iy]   += temp_r * a_ptr[i2]   - temp_i * a_ptr[i2+1];
			y[iy+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
#else
			y[iy]   += temp_r * a_ptr[i2]   + temp_i * a_ptr[i2+1];
			y[iy+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
#endif

#else

#if !defined(XCONJ)
			y[iy]   += temp_r * a_ptr[i2]   + temp_i * a_ptr[i2+1];
			y[iy+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
#else
			y[iy]   += temp_r * a_ptr[i2]   - temp_i * a_ptr[i2+1];
			y[iy+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
#endif

#endif
			i2 += 2;
			iy += inc_y2;
		}
		a_ptr += lda2;
		ix    += inc_x2;
	}

	return(0);
}
|
||||
/*
 * Plain C reference for the complex transposed matrix-vector update.
 *
 * For each of the n columns of a (interleaved real/imag pairs, columns
 * 2*lda FLOATs apart) the dot product of that column's m entries with x is
 * formed, scaled by alpha and added to the matching element of y.
 * inc_x / inc_y are strides in complex elements.  CONJ / XCONJ select the
 * conjugated variants at compile time.  dummy1 is unused.  Always returns 0.
 */
int zgemv_t_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG row, col;
	BLASLONG px, pa;
	BLASLONG py = 0;
	BLASLONG step_x = 2 * inc_x;
	BLASLONG step_y = 2 * inc_y;
	BLASLONG col_stride = 2 * lda;
	FLOAT *column = a;
	FLOAT sum_r, sum_i;

	/* With inc_x == inc_y == 1 both strides are 2, so one loop nest covers
	   the unit-stride case as well. */
	for (col = 0; col < n; col++)
	{
		sum_r = 0.0;
		sum_i = 0.0;
		px = 0;
		pa = 0;

		for (row = 0; row < m; row++)
		{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			sum_r += column[pa] * x[px]   - column[pa+1] * x[px+1];
			sum_i += column[pa] * x[px+1] + column[pa+1] * x[px];
#else
			sum_r += column[pa] * x[px]   + column[pa+1] * x[px+1];
			sum_i += column[pa] * x[px+1] - column[pa+1] * x[px];
#endif
			pa += 2;
			px += step_x;
		}

#if !defined(XCONJ)
		y[py]   += alpha_r * sum_r - alpha_i * sum_i;
		y[py+1] += alpha_r * sum_i + alpha_i * sum_r;
#else
		y[py]   += alpha_r * sum_r + alpha_i * sum_i;
		y[py+1] -= alpha_r * sum_i - alpha_i * sum_r;
#endif

		column += col_stride;
		py     += step_y;
	}

	return(0);
}
|
||||
#else
|
||||
/*
 * Plain C reference for the real non-transposed matrix-vector update
 * y += alpha * A * x.
 *
 * a is read as n columns of m entries, consecutive columns lda FLOATs
 * apart.  x is read with stride inc_x (once per column) and y is updated
 * with stride inc_y (once per row).  dummy1 is unused.  Always returns 0.
 */
int gemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	FLOAT *column = a;
	BLASLONG px = 0;
	BLASLONG col, row;

	for (col = 0; col < n; col++)
	{
		/* Scale the x entry once, then sweep it down the column. */
		FLOAT scaled = alpha * x[px];
		BLASLONG py = 0;

		for (row = 0; row < m; row++)
		{
			y[py] += scaled * column[row];
			py += inc_y;
		}

		column += lda;
		px     += inc_x;
	}

	return(0);
}
|
||||
/*
 * Plain C reference for the real transposed matrix-vector update.
 *
 * For each of the n columns of a (consecutive columns lda FLOATs apart) the
 * dot product of that column's m entries with x (stride inc_x) is formed,
 * scaled by alpha and added to the next element of y (stride inc_y).
 * dummy1 is unused.  Always returns 0.
 */
int gemv_t_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	FLOAT *column = a;
	BLASLONG py = 0;
	BLASLONG col, row;

	for (col = 0; col < n; col++)
	{
		FLOAT sum = 0.0;
		BLASLONG px = 0;

		for (row = 0; row < m; row++)
		{
			sum += column[row] * x[px];
			px += inc_x;
		}

		y[py] += alpha * sum;
		py     += inc_y;
		column += lda;
	}

	return(0);
}
|
||||
#endif
|
||||
|
||||
#undef GEMV
|
||||
#ifndef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define GEMV BLASFUNC(dgemv)
|
||||
#else
|
||||
#define GEMV BLASFUNC(sgemv)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define GEMV BLASFUNC(zgemv)
|
||||
#else
|
||||
#define GEMV BLASFUNC(cgemv)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocate `size` bytes from a SHM_HUGETLB System V shared memory segment.
 *
 * The requested size is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  The segment is marked for removal right after it is
 * attached.  On any failure a message is printed and the process exits
 * with status 1.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

	void *address;
	int shmid = shmget(IPC_PRIVATE,
			   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
			   SHM_HUGETLB | IPC_CREAT |0600);

	if (shmid < 0) {
		printf( "Memory allocation failed(shmget).\n");
		exit(1);
	}

	address = shmat(shmid, NULL, SHM_RND);

	if ((BLASLONG)address == -1){
		printf( "Memory allocation failed(shmat).\n");
		exit(1);
	}

	/* Mark the segment for removal now that it is attached. */
	shmctl(shmid, IPC_RMID, 0);

	return address;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *x, *y, *y_c;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
FLOAT beta [] = {1.0, 1.0};
|
||||
char trans='N';
|
||||
blasint m, i, j;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
blasint n=0;
|
||||
int has_param_n = 0;
|
||||
int has_param_m = 0;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
blasint iy;
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
|
||||
int tomax = to;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
|
||||
if ((p = getenv("OPENBLAS_PARAM_N"))) {
|
||||
n = atoi(p);
|
||||
if ((n>0)) has_param_n = 1;
|
||||
if ( n > tomax ) tomax = n;
|
||||
}
|
||||
if ( has_param_n == 0 )
|
||||
if ((p = getenv("OPENBLAS_PARAM_M"))) {
|
||||
m = atoi(p);
|
||||
if ((m>0)) has_param_m = 1;
|
||||
if ( m > tomax ) tomax = m;
|
||||
}
|
||||
|
||||
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops);
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
if (has_param_m == 0)
|
||||
{
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
if ( has_param_n == 0 ) n = m;
|
||||
fprintf(stderr, " %6dx%d :", (int)m,(int)n);
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < n * COMPSIZE; i++){
|
||||
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
y_c[i]= y[i];
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#ifdef COMPLEX
|
||||
if (trans == 'N')
|
||||
zgemv_n_c(m, n, 0, alpha[0], alpha[1], a, m, x, inc_x, y_c, inc_y);
|
||||
else
|
||||
zgemv_t_c(m, n, 0, alpha[0], alpha[1], a, m, x, inc_x, y_c, inc_y);
|
||||
#else
|
||||
if (trans == 'N')
|
||||
gemv_n_c(m, n, 0, *alpha, a, m, x, inc_x, y_c, inc_y);
|
||||
else
|
||||
gemv_t_c(m, n, 0, *alpha, a, m, x, inc_x, y_c, inc_y);
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
iy = 0;
|
||||
#ifdef COMPLEX
|
||||
for (i = 0; i < m * 2; i++)
|
||||
#else
|
||||
for (i = 0; i < m; i++)
|
||||
#endif
|
||||
{
|
||||
test &= assert_dbl_near(y[iy], y_c[iy], SINGLE_EPS);
|
||||
iy += inc_y;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for(n = from; n <= to; n += step)
|
||||
{
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
fprintf(stderr, " %6dx%d :", (int)m,(int)n);
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < n * COMPSIZE; i++){
|
||||
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
y_c[i]= y[i];
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#ifdef COMPLEX
|
||||
if (trans == 'N')
|
||||
zgemv_n_c(m, n, 0, alpha[0], alpha[1], a, m, x, inc_x, y_c, inc_y);
|
||||
else
|
||||
zgemv_t_c(m, n, 0, alpha[0], alpha[1], a, m, x, inc_x, y_c, inc_y);
|
||||
#else
|
||||
if (trans == 'N')
|
||||
gemv_n_c(m, n, 0, *alpha, a, m, x, inc_x, y_c, inc_y);
|
||||
else
|
||||
gemv_t_c(m, n, 0, *alpha, a, m, x, inc_x, y_c, inc_y);
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
iy = 0;
|
||||
#ifdef COMPLEX
|
||||
for (i = 0; i < m * 2; i++)
|
||||
#else
|
||||
for (i = 0; i < m; i++)
|
||||
#endif
|
||||
{
|
||||
test &= assert_dbl_near(y[iy], y_c[iy], SINGLE_EPS);
|
||||
iy += inc_y;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,284 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Tolerance check for two doubles.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest absolute difference still treated as a match
 *
 * Returns 0 when |exp - real| is greater than tol, and 1 otherwise.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double delta = exp - real;
    /* magnitude taken by hand: avoids fabs and linking with a math lib */
    double magnitude = (delta < 0) ? -delta : delta;

    return (magnitude > tol) ? 0 : 1;
}
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
#ifdef COMPLEX
|
||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
||||
/*
 * Plain C reference for the complex index-of-maximum search.
 *
 * Visits n entries of x spaced 2*inc_x FLOATs apart and scores each one with
 * CABS1, i.e. ABS(x[ix]) + ABS(x[ix+1]).  Returns the 1-based position of the
 * first entry with the largest score (later entries must be strictly larger
 * to replace it), or 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG winner = 0;
    BLASLONG k;
    BLASLONG pos;
    BLASLONG stride;
    FLOAT largest;

    if (n <= 0 || inc_x <= 0) return 0;

    stride = 2 * inc_x;
    largest = CABS1(x,0);

    for (k = 1, pos = stride; k < n; k++, pos += stride) {
        if (CABS1(x,pos) > largest) {
            winner = k;
            largest = CABS1(x,pos);
        }
    }

    return winner + 1;
}
|
||||
#else
|
||||
/*
 * Plain C reference for the real index-of-maximum-magnitude search.
 *
 * Visits n entries of x spaced inc_x apart and compares ABS() of each.
 * Returns the 1-based position of the first entry with the largest absolute
 * value (later entries must be strictly larger to replace it), or 0 when
 * n <= 0 or inc_x <= 0.
 */
BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG winner = 0;
    BLASLONG k;
    BLASLONG pos;
    FLOAT largest;

    if (n <= 0 || inc_x <= 0) return 0;

    largest = ABS(x[0]);

    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
        if (ABS(x[pos]) > largest) {
            winner = k;
            largest = ABS(x[pos]);
        }
    }

    return winner + 1;
}
|
||||
#endif
|
||||
|
||||
#undef IAMAX
|
||||
#ifdef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define IAMAX BLASFUNC(izamax)
|
||||
#else
|
||||
#define IAMAX BLASFUNC(icamax)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define IAMAX BLASFUNC(idamax)
|
||||
#else
|
||||
#define IAMAX BLASFUNC(isamax)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocator backed by a System V shared-memory segment requested with
 * SHM_HUGETLB.
 *
 * The segment length passed to shmget is (size + HUGE_PAGESIZE) masked down to
 * a HUGE_PAGESIZE boundary.  After attaching, the segment is flagged with
 * IPC_RMID.  Prints a message and exits with status 1 if shmget or shmat
 * fails; otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    int segment;
    void *mapped;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapped = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapped == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapped;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
BLASLONG result, result_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result = IAMAX (&m, x, &inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#ifdef COMPLEX
|
||||
result_c = izamax_c(m, x, inc_x);
|
||||
#else
|
||||
result_c = iamax_c(m, x, inc_x);
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
test &= (result == result_c);
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#else
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,284 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Tolerance check for two doubles.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest absolute difference still treated as a match
 *
 * Returns 0 when |exp - real| is greater than tol, and 1 otherwise.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double delta = exp - real;
    /* magnitude taken by hand: avoids fabs and linking with a math lib */
    double magnitude = (delta < 0) ? -delta : delta;

    return (magnitude > tol) ? 0 : 1;
}
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
#ifdef COMPLEX
|
||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
||||
/*
 * Plain C reference for the complex index-of-minimum search.
 *
 * Visits n entries of x spaced 2*inc_x FLOATs apart and scores each one with
 * CABS1, i.e. ABS(x[ix]) + ABS(x[ix+1]).  Returns the 1-based position of the
 * first entry with the smallest score (later entries must be strictly smaller
 * to replace it), or 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG winner = 0;
    BLASLONG k;
    BLASLONG pos;
    BLASLONG stride;
    FLOAT smallest;

    if (n <= 0 || inc_x <= 0) return 0;

    stride = 2 * inc_x;
    smallest = CABS1(x,0);

    for (k = 1, pos = stride; k < n; k++, pos += stride) {
        if (CABS1(x,pos) < smallest) {
            winner = k;
            smallest = CABS1(x,pos);
        }
    }

    return winner + 1;
}
|
||||
#else
|
||||
/*
 * Plain C reference for the real index-of-minimum-magnitude search.
 *
 * Visits n entries of x spaced inc_x apart and compares ABS() of each.
 * Returns the 1-based position of the first entry with the smallest absolute
 * value (later entries must be strictly smaller to replace it), or 0 when
 * n <= 0 or inc_x <= 0.
 */
BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG winner = 0;
    BLASLONG k;
    BLASLONG pos;
    FLOAT smallest;

    if (n <= 0 || inc_x <= 0) return 0;

    smallest = ABS(x[0]);

    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
        if (ABS(x[pos]) < smallest) {
            winner = k;
            smallest = ABS(x[pos]);
        }
    }

    return winner + 1;
}
|
||||
#endif
|
||||
|
||||
#undef IAMIN
|
||||
#ifdef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define IAMIN BLASFUNC(izamin)
|
||||
#else
|
||||
#define IAMIN BLASFUNC(icamin)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define IAMIN BLASFUNC(idamin)
|
||||
#else
|
||||
#define IAMIN BLASFUNC(isamin)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocator backed by a System V shared-memory segment requested with
 * SHM_HUGETLB.
 *
 * The segment length passed to shmget is (size + HUGE_PAGESIZE) masked down to
 * a HUGE_PAGESIZE boundary.  After attaching, the segment is flagged with
 * IPC_RMID.  Prints a message and exits with status 1 if shmget or shmat
 * fails; otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    int segment;
    void *mapped;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapped = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapped == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapped;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
BLASLONG result, result_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result = IAMIN (&m, x, &inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#ifdef COMPLEX
|
||||
result_c = izamin_c(m, x, inc_x);
|
||||
#else
|
||||
result_c = iamin_c(m, x, inc_x);
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
test &= (result == result_c);
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#else
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,231 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Tolerance check for two doubles.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest absolute difference still treated as a match
 *
 * Returns 0 when |exp - real| is greater than tol, and 1 otherwise.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double delta = exp - real;
    /* magnitude taken by hand: avoids fabs and linking with a math lib */
    double magnitude = (delta < 0) ? -delta : delta;

    return (magnitude > tol) ? 0 : 1;
}
|
||||
|
||||
/*
 * Plain C reference for the index-of-maximum-value search (signed compare,
 * no absolute value taken).
 *
 * Visits n entries of x spaced inc_x apart.  Returns the 1-based position of
 * the first entry holding the largest value (later entries must be strictly
 * larger to replace it), or 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG winner = 0;
    BLASLONG k;
    BLASLONG pos;
    FLOAT largest;

    if (n <= 0 || inc_x <= 0) return 0;

    largest = x[0];

    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
        if (x[pos] > largest) {
            winner = k;
            largest = x[pos];
        }
    }

    return winner + 1;
}
|
||||
|
||||
#undef IMAX
|
||||
#ifdef DOUBLE
|
||||
#define IMAX BLASFUNC(idmax)
|
||||
#else
|
||||
#define IMAX BLASFUNC(ismax)
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocator backed by a System V shared-memory segment requested with
 * SHM_HUGETLB.
 *
 * The segment length passed to shmget is (size + HUGE_PAGESIZE) masked down to
 * a HUGE_PAGESIZE boundary.  After attaching, the segment is flagged with
 * IPC_RMID.  Prints a message and exits with status 1 if shmget or shmat
 * fails; otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    int segment;
    void *mapped;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapped = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapped == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapped;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
BLASLONG result, result_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result = IMAX (&m, x, &inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result_c = imax_c(m, x, inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
test &= (result == result_c);
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,231 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Tolerance check for two doubles.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest absolute difference still treated as a match
 *
 * Returns 0 when |exp - real| is greater than tol, and 1 otherwise.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double delta = exp - real;
    /* magnitude taken by hand: avoids fabs and linking with a math lib */
    double magnitude = (delta < 0) ? -delta : delta;

    return (magnitude > tol) ? 0 : 1;
}
|
||||
|
||||
/*
 * Plain C reference for the index-of-minimum-value search (signed compare,
 * no absolute value taken).
 *
 * Visits n entries of x spaced inc_x apart.  Returns the 1-based position of
 * the first entry holding the smallest value (later entries must be strictly
 * smaller to replace it), or 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG winner = 0;
    BLASLONG k;
    BLASLONG pos;
    FLOAT smallest;

    if (n <= 0 || inc_x <= 0) return 0;

    smallest = x[0];

    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
        if (x[pos] < smallest) {
            winner = k;
            smallest = x[pos];
        }
    }

    return winner + 1;
}
|
||||
|
||||
#undef IMIN
|
||||
#ifdef DOUBLE
|
||||
#define IMIN BLASFUNC(idmin)
|
||||
#else
|
||||
#define IMIN BLASFUNC(ismin)
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocator backed by a System V shared-memory segment requested with
 * SHM_HUGETLB.
 *
 * The segment length passed to shmget is (size + HUGE_PAGESIZE) masked down to
 * a HUGE_PAGESIZE boundary.  After attaching, the segment is flagged with
 * IPC_RMID.  Prints a message and exits with status 1 if shmget or shmat
 * fails; otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    int segment;
    void *mapped;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapped = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapped == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapped;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
BLASLONG result, result_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result = IMIN (&m, x, &inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result_c = imin_c(m, x, inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
test &= (result == result_c);
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,229 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Tolerance check for two doubles.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest absolute difference still treated as a match
 *
 * Returns 0 when |exp - real| is greater than tol, and 1 otherwise.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double delta = exp - real;
    /* magnitude taken by hand: avoids fabs and linking with a math lib */
    double magnitude = (delta < 0) ? -delta : delta;

    return (magnitude > tol) ? 0 : 1;
}
|
||||
|
||||
/*
 * Plain C reference for the maximum-value search (signed compare, no
 * absolute value taken).
 *
 * Visits n entries of x spaced inc_x apart and returns the largest value
 * found, or 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG k;
    BLASLONG pos;
    FLOAT largest = 0.0;

    if (n <= 0 || inc_x <= 0) return largest;

    largest = x[0];

    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
        if (x[pos] > largest) {
            largest = x[pos];
        }
    }

    return largest;
}
|
||||
|
||||
#undef MAX_
|
||||
#ifdef DOUBLE
|
||||
#define MAX_ BLASFUNC(dmax)
|
||||
#else
|
||||
#define MAX_ BLASFUNC(smax)
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocates memory from a private SysV shared-memory segment requested with
 * SHM_HUGETLB. Prints a message and exits with status 1 if shmget or shmat
 * fails. The segment is marked for removal (IPC_RMID) once it is attached.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  int seg;
  void *mem;

  seg = shmget(IPC_PRIVATE,
               (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
               SHM_HUGETLB | IPC_CREAT |0600);
  if (seg < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mem = shmat(seg, NULL, SHM_RND);
  if ((BLASLONG)mem == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(seg, IPC_RMID, 0);
  return mem;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
FLOAT result, result_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result = MAX_ (&m, x, &inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result_c = max_c(m, x, inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
test &= assert_dbl_near(result, result_c, SINGLE_EPS);
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,229 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Returns 1 when exp and real differ by at most tol, 0 otherwise.
 * A NaN on either side is never "near", so a kernel that produces NaN
 * is reported as a failure instead of silently passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double absdiff;

  if (exp == real) {
    /* also covers equal infinities, where exp - real would be NaN */
    absdiff = 0.0;
  } else {
    double diff = exp - real;
    /* avoid using fabs and linking with a math lib */
    absdiff = (diff < 0) ? -diff : diff;
  }

  /* "<=" is false when absdiff is NaN, unlike "!(absdiff > tol)" */
  return (absdiff <= tol) ? 1 : 0;
}
|
||||
|
||||
/*
 * Plain C reference for the min kernel.
 * Visits n elements of x spaced inc_x apart and returns the smallest one.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  BLASLONG k;
  BLASLONG pos;
  FLOAT best;

  if (n <= 0 || inc_x <= 0) {
    return 0.0;
  }

  best = x[0];
  for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
    if (x[pos] < best) {
      best = x[pos];
    }
  }
  return best;
}
|
||||
|
||||
#undef MIN_
|
||||
#ifdef DOUBLE
|
||||
#define MIN_ BLASFUNC(dmin)
|
||||
#else
|
||||
#define MIN_ BLASFUNC(smin)
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocates memory from a private SysV shared-memory segment requested with
 * SHM_HUGETLB. Prints a message and exits with status 1 if shmget or shmat
 * fails. The segment is marked for removal (IPC_RMID) once it is attached.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  int seg;
  void *mem;

  seg = shmget(IPC_PRIVATE,
               (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
               SHM_HUGETLB | IPC_CREAT |0600);
  if (seg < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mem = shmat(seg, NULL, SHM_RND);
  if ((BLASLONG)mem == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(seg, IPC_RMID, 0);
  return mem;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
FLOAT result, result_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result = MIN_ (&m, x, &inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
result_c = min_c(m, x, inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
test &= assert_dbl_near(result, result_c, SINGLE_EPS);
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,303 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Returns 1 when exp and real differ by at most tol, 0 otherwise.
 * A NaN on either side is never "near", so a kernel that produces NaN
 * is reported as a failure instead of silently passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double absdiff;

  if (exp == real) {
    /* also covers equal infinities, where exp - real would be NaN */
    absdiff = 0.0;
  } else {
    double diff = exp - real;
    /* avoid using fabs and linking with a math lib */
    absdiff = (diff < 0) ? -diff : diff;
  }

  /* "<=" is false when absdiff is NaN, unlike "!(absdiff > tol)" */
  return (absdiff <= tol) ? 1 : 0;
}
|
||||
|
||||
#ifdef COMPLEX
|
||||
/*
 * Plain C reference for the complex plane rotation with real c and s.
 * For each of the n complex elements (stored as real/imaginary pairs):
 *   x' = c*x + s*y,  y' = c*y - s*x
 * inc_x / inc_y are strides in complex elements. Always returns 0.
 */
int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
  BLASLONG k;
  BLASLONG px = 0, py = 0;
  BLASLONG step_x, step_y;
  FLOAT new_re, new_im;

  if ( n <= 0 ) return(0);

  /* two scalars per complex element */
  step_x = 2 * inc_x ;
  step_y = 2 * inc_y ;

  for (k = 0; k < n; k++)
  {
    /* new x is computed first because the y update needs the old x */
    new_re = c*x[px] + s*y[py] ;
    new_im = c*x[px+1] + s*y[py+1] ;
    y[py] = c*y[py] - s*x[px] ;
    y[py+1] = c*y[py+1] - s*x[px+1] ;
    x[px] = new_re ;
    x[px+1] = new_im ;

    px += step_x ;
    py += step_y ;
  }
  return(0);
}
|
||||
#else
|
||||
/*
 * Plain C reference for the real plane rotation.
 * For each of the n elements: x' = c*x + s*y,  y' = c*y - s*x.
 * inc_x / inc_y are the element strides. Always returns 0.
 */
int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
  BLASLONG k;
  BLASLONG px = 0, py = 0;
  FLOAT new_x;

  if ( n <= 0 ) return(0);

  for (k = 0; k < n; k++)
  {
    /* keep the new x aside: the y update still needs the old x */
    new_x = c*x[px] + s*y[py] ;
    y[py] = c*y[py] - s*x[px] ;
    x[px] = new_x ;

    px += inc_x ;
    py += inc_y ;
  }
  return(0);
}
|
||||
#endif
|
||||
|
||||
#undef ROT
|
||||
#ifdef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define ROT BLASFUNC(zdrot)
|
||||
#else
|
||||
#define ROT BLASFUNC(csrot)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define ROT BLASFUNC(drot)
|
||||
#else
|
||||
#define ROT BLASFUNC(srot)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocates memory from a private SysV shared-memory segment requested with
 * SHM_HUGETLB. Prints a message and exits with status 1 if shmget or shmat
 * fails. The segment is marked for removal (IPC_RMID) once it is attached.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  int seg;
  void *mem;

  seg = shmget(IPC_PRIVATE,
               (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
               SHM_HUGETLB | IPC_CREAT |0600);
  if (seg < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mem = shmat(seg, NULL, SHM_RND);
  if ((BLASLONG)mem == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(seg, IPC_RMID, 0);
  return mem;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y, *x_c, *y_c;
|
||||
// FLOAT result;
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
FLOAT c[1] = { 2.0 };
|
||||
FLOAT s[1] = { 2.0 };
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
blasint ix,iy;
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
x_c[i] = x[i];
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
y_c[i] = y[i];
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
ROT (&m, x, &inc_x, y, &inc_y, c, s);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#ifdef COMPLEX
|
||||
zrot_c(m, x_c, inc_x, y_c, inc_y, *c, *s);
|
||||
#else
|
||||
rot_c(m, x_c, inc_x, y_c, inc_y, *c, *s);
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
ix = 0;
|
||||
iy = 0;
|
||||
#ifdef COMPLEX
|
||||
for (i = 0; i < m * 2; i++)
|
||||
#else
|
||||
for (i = 0; i < m; i++)
|
||||
#endif
|
||||
{
|
||||
test &= assert_dbl_near(x[ix], x_c[ix], SINGLE_EPS);
|
||||
test &= assert_dbl_near(y[iy], y_c[iy], SINGLE_EPS);
|
||||
ix += inc_x;
|
||||
iy += inc_y;
|
||||
}
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,308 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Returns 1 when exp and real differ by at most tol, 0 otherwise.
 * A NaN on either side is never "near", so a kernel that produces NaN
 * is reported as a failure instead of silently passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double absdiff;

  if (exp == real) {
    /* also covers equal infinities, where exp - real would be NaN */
    absdiff = 0.0;
  } else {
    double diff = exp - real;
    /* avoid using fabs and linking with a math lib */
    absdiff = (diff < 0) ? -diff : diff;
  }

  /* "<=" is false when absdiff is NaN, unlike "!(absdiff > tol)" */
  return (absdiff <= tol) ? 1 : 0;
}
|
||||
|
||||
#ifdef COMPLEX
|
||||
/*
 * Plain C reference for complex scaling: x := (da_r + i*da_i) * x.
 * x holds n complex elements as real/imaginary pairs, inc_x complex
 * elements apart. Does nothing when n <= 0 or inc_x <= 0.
 * A zero component of the scalar is special-cased so that part of the
 * product is not computed (a zero scalar stores plain zeros).
 * dummy0, dummy1, y, inc_y, dummy and dummy2 are unused. Always returns 0.
 */
int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
  BLASLONG k;
  BLASLONG pos = 0;
  BLASLONG stride;
  FLOAT new_re;

  if ( (n <= 0) || (inc_x <= 0))
    return(0);

  stride = 2 * inc_x;

  for (k = 0; k < n; k++, pos += stride)
  {
    if ( da_r == 0.0 && da_i == 0.0 )
    {
      new_re = 0.0;
      x[pos+1] = 0.0 ;
    }
    else if ( da_r == 0.0 )
    {
      /* purely imaginary scalar */
      new_re = - da_i * x[pos+1] ;
      x[pos+1] = da_i * x[pos] ;
    }
    else if ( da_i == 0.0 )
    {
      /* purely real scalar */
      new_re = da_r * x[pos] ;
      x[pos+1] = da_r * x[pos+1];
    }
    else
    {
      new_re = da_r * x[pos] - da_i * x[pos+1] ;
      x[pos+1] = da_r * x[pos+1] + da_i * x[pos] ;
    }
    /* the real part is stored last: the imaginary update reads the old one */
    x[pos] = new_re;
  }

  return(0);
}
|
||||
#else
|
||||
/*
 * Plain C reference for real scaling: x := da * x over n elements spaced
 * inc_x apart. Does nothing when n <= 0 or inc_x <= 0. When da is zero the
 * elements are set to 0.0 directly rather than multiplied.
 * dummy0, dummy1, y, inc_y, dummy and dummy2 are unused. Always returns 0.
 */
int scal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
  BLASLONG k;
  BLASLONG pos = 0;

  if ( (n <= 0) || (inc_x <= 0))
    return(0);

  for (k = 0; k < n; k++, pos += inc_x)
  {
    if ( da == 0.0 ) {
      x[pos] = 0.0;
    } else {
      x[pos] = da * x[pos] ;
    }
  }

  return 0;
}
|
||||
#endif
|
||||
|
||||
#undef SCAL
|
||||
#ifdef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define SCAL BLASFUNC(zscal)
|
||||
#else
|
||||
#define SCAL BLASFUNC(cscal)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define SCAL BLASFUNC(dscal)
|
||||
#else
|
||||
#define SCAL BLASFUNC(sscal)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocates memory from a private SysV shared-memory segment requested with
 * SHM_HUGETLB. Prints a message and exits with status 1 if shmget or shmat
 * fails. The segment is marked for removal (IPC_RMID) once it is attached.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  int seg;
  void *mem;

  seg = shmget(IPC_PRIVATE,
               (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
               SHM_HUGETLB | IPC_CREAT |0600);
  if (seg < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mem = shmat(seg, NULL, SHM_RND);
  if ((BLASLONG)mem == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(seg, IPC_RMID, 0);
  return mem;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *x_c;
|
||||
FLOAT alpha[2] = { 2.0, 2.0 };
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
blasint ix;
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
x_c[i] = x[i];
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
SCAL (&m, alpha, x, &inc_x);
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#ifdef COMPLEX
|
||||
zscal_c(m, 0, 0, alpha[0],alpha[1], x_c, inc_x, NULL, 0, NULL, 0);
|
||||
#else
|
||||
scal_c(m, 0, 0, *alpha, x_c, inc_x, NULL, 0, NULL, 0);
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
ix = 0;
|
||||
#ifdef COMPLEX
|
||||
for (i = 0; i < m * 2; i++)
|
||||
#else
|
||||
for (i = 0; i < m; i++)
|
||||
#endif
|
||||
{
|
||||
test &= assert_dbl_near(x[ix], x_c[ix], SINGLE_EPS);
|
||||
ix += inc_x;
|
||||
}
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#else
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,306 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
#define SINGLE_EPS 1e-04
|
||||
#define DOUBLE_EPS 1e-13
|
||||
|
||||
/*
 * Returns 1 when exp and real differ by at most tol, 0 otherwise.
 * A NaN on either side is never "near", so a kernel that produces NaN
 * is reported as a failure instead of silently passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double absdiff;

  if (exp == real) {
    /* also covers equal infinities, where exp - real would be NaN */
    absdiff = 0.0;
  } else {
    double diff = exp - real;
    /* avoid using fabs and linking with a math lib */
    absdiff = (diff < 0) ? -diff : diff;
  }

  /* "<=" is false when absdiff is NaN, unlike "!(absdiff > tol)" */
  return (absdiff <= tol) ? 1 : 0;
}
|
||||
|
||||
#ifdef COMPLEX
|
||||
/*
 * Plain C reference for the complex swap: exchanges n complex elements
 * (real/imaginary pairs) between x and y. inc_x / inc_y are strides in
 * complex elements. dummy0..dummy4 and dummy are unused. Always returns 0.
 */
int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
  BLASLONG k;
  BLASLONG px = 0, py = 0;
  BLASLONG step_x, step_y;
  FLOAT keep_re, keep_im;

  if ( n < 0 ) return(0);

  /* two scalars per complex element */
  step_x = 2 * inc_x;
  step_y = 2 * inc_y;

  for (k = 0; k < n; k++)
  {
    keep_re = x[px] ;
    keep_im = x[px+1] ;
    x[px] = y[py] ;
    x[px+1] = y[py+1] ;
    y[py] = keep_re ;
    y[py+1] = keep_im ;

    px += step_x ;
    py += step_y ;
  }
  return(0);
}
|
||||
#else
|
||||
/*
 * Plain C reference for the real swap: exchanges n elements between x and y,
 * stepping through them with strides inc_x and inc_y starting at offset 0 of
 * each pointer. The dummy* arguments are not read.
 *
 * Returns 0 in every case; a negative n performs no work.
 */
int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG px = 0;
	BLASLONG py = 0;
	FLOAT saved;

	if (n < 0) {
		return 0;
	}

	for (k = 0; k < n; k++) {
		saved = x[px];
		x[px] = y[py];
		y[py] = saved;

		px += inc_x;
		py += inc_y;
	}

	return 0;
}
|
||||
#endif
|
||||
|
||||
/*
 * Bind SWAP to the swap entry point for this build:
 *   COMPLEX + DOUBLE -> zswap,  COMPLEX only -> cswap,
 *   DOUBLE only      -> dswap,  neither      -> sswap.
 */
#undef SWAP
#if defined(COMPLEX) && defined(DOUBLE)
#define SWAP BLASFUNC(zswap)
#elif defined(COMPLEX)
#define SWAP BLASFUNC(cswap)
#elif defined(DOUBLE)
#define SWAP BLASFUNC(dswap)
#else
#define SWAP BLASFUNC(sswap)
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocate a buffer from a SHM_HUGETLB shared-memory segment.
 *
 * The requested length is size plus one HUGE_PAGESIZE, masked down to a
 * multiple of HUGE_PAGESIZE. On failure of shmget() or shmat() a message is
 * printed and the process exits with status 1. IPC_RMID is issued on the
 * segment right after it has been attached.
 *
 * Returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
  int segment_id;
  void *mapping;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  segment_id = shmget(IPC_PRIVATE,
		      (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
		      SHM_HUGETLB | IPC_CREAT |0600);
  if (segment_id < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapping = shmat(segment_id, NULL, SHM_RND);
  if ((BLASLONG)mapping == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment_id, IPC_RMID, 0);

  return mapping;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y, *x_c, *y_c;
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg,timeg_c;
|
||||
|
||||
blasint ix,iy;
|
||||
int test = 1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time CTime Test\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
timeg_c=0;
|
||||
|
||||
fprintf(stderr, " %6d :", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
x_c[i] = x[i];
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
y_c[i] = y[i];
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
SWAP (&m, x, &inc_x, y, &inc_y );
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg += time1;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#ifdef COMPLEX
|
||||
zswap_c(m, 0, 0, 0, 0, x_c, inc_x, y_c, inc_y, NULL, 0);
|
||||
#else
|
||||
swap_c(m, 0, 0, 0, x_c, inc_x, y_c, inc_y, NULL, 0);
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
timeg_c += time1;
|
||||
|
||||
ix = 0;
|
||||
iy = 0;
|
||||
#ifdef COMPLEX
|
||||
for (i = 0; i < m * 2; i++)
|
||||
#else
|
||||
for (i = 0; i < m; i++)
|
||||
#endif
|
||||
{
|
||||
test &= assert_dbl_near(x[ix], x_c[ix], SINGLE_EPS);
|
||||
test &= assert_dbl_near(y[ix], y_c[ix], SINGLE_EPS);
|
||||
ix += inc_x;
|
||||
iy += inc_y;
|
||||
}
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg_c /= loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#else
|
||||
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
Loading…
Reference in New Issue