[ZARCH] Z14 support, BLAS 1/2 single precision implementations, Some missing double precision implementations, Gemv optimization

This commit is contained in:
maamountki 2018-08-06 18:20:40 +03:00 committed by GitHub
parent ee955757f9
commit 23229011db
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
79 changed files with 17314 additions and 2897 deletions

View File

@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector
endif
# When CORE is Z14, append the z14 architecture flag and -mzvector to both
# common option variables.
ifeq ($(CORE), Z14)
CCOMMON_OPT += -march=z14 -mzvector
FCOMMON_OPT += -march=z14 -mzvector
endif

View File

@ -29,40 +29,25 @@
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
/*
 * Printable CPU names. Each table is meant to have one entry per CPU_*
 * identifier, in the same order, so the identifier can be used as an index.
 * Every entry must be followed by a comma: two adjacent string literals are
 * concatenated by the compiler and would silently merge into one entry.
 */
static char *cpuname[] = {
    "ZARCH_GENERIC",
    "Z13",
    "Z14"
};

/* Lower-case spellings of the names above, in the same order. */
static char *cpuname_lower[] = {
    "zarch_generic",
    "z13",
    "z14"
};
/*
 * Detects the host CPU model from the "Type" line of /proc/sysinfo.
 *
 * Returns CPU_Z13 when the text after the colon contains 2964 or 2965,
 * CPU_Z14 when it contains 3906 or 3907, and CPU_GENERIC otherwise. That
 * includes the cases where the file cannot be opened or holds no usable
 * "Type" line.
 */
int detect(void)
{
    FILE *infile;
    char buffer[512], *p;

    p = (char *)NULL;

    infile = fopen("/proc/sysinfo", "r");
    if (infile == NULL) {
        /* No sysinfo available: fall back to the generic target. */
        return CPU_GENERIC;
    }

    while (fgets(buffer, sizeof(buffer), infile)) {
        if (!strncmp("Type", buffer, 4)) {
            /* Search the text that follows the colon; a "Type" line
               without a colon leaves p NULL and is treated as unknown. */
            p = strchr(buffer, ':');
            if (p != NULL) {
                p++;
            }
            break;
        }
    }
    fclose(infile);

    if (p == NULL) {
        return CPU_GENERIC;
    }

    if (strstr(p, "2964")) return CPU_Z13;
    if (strstr(p, "2965")) return CPU_Z13;

    /* IBM z14 machine types. */
    if (strstr(p, "3906")) return CPU_Z14;
    if (strstr(p, "3907")) return CPU_Z14;

    return CPU_GENERIC;
}
void get_libname(void)
@ -107,5 +92,9 @@ void get_cpuconfig(void)
printf("#define Z13\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
}
}

View File

@ -1,18 +1,18 @@
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = damax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = zamax.c
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
DAMINKERNEL = damin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
DMAXKERNEL = dmax.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
DMINKERNEL = dmin.c
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = idamax.c
@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = idmax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
IDMINKERNEL = idmin.c
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = dasum.c
@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = dgemv_n_4.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = dgemv_t_4.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = zgemv_t_4.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
STRMMKERNEL = strmm8x4V.S
DTRMMKERNEL = trmm8x4V.S

146
kernel/zarch/KERNEL.Z14 Normal file
View File

@ -0,0 +1,146 @@
# Kernel source selection for the Z14 target.
# Entries without a directory prefix name files in this directory; entries
# beginning with ../arm/ or ../generic/ reuse sources from those directories.

# Maximum / minimum of absolute values.
SAMAXKERNEL = samax.c
DAMAXKERNEL = damax.c
CAMAXKERNEL = camax.c
ZAMAXKERNEL = zamax.c
SAMINKERNEL = samin.c
DAMINKERNEL = damin.c
CAMINKERNEL = camin.c
ZAMINKERNEL = zamin.c
# Maximum / minimum (real types only).
SMAXKERNEL = smax.c
DMAXKERNEL = dmax.c
SMINKERNEL = smin.c
DMINKERNEL = dmin.c
# Index-returning variants of the kernels above.
ISAMAXKERNEL = isamax.c
IDAMAXKERNEL = idamax.c
ICAMAXKERNEL = icamax.c
IZAMAXKERNEL = izamax.c
ISAMINKERNEL = isamin.c
IDAMINKERNEL = idamin.c
ICAMINKERNEL = icamin.c
IZAMINKERNEL = izamin.c
ISMAXKERNEL = ismax.c
IDMAXKERNEL = idmax.c
ISMINKERNEL = ismin.c
IDMINKERNEL = idmin.c
# asum, axpy, copy, dot.
SASUMKERNEL = sasum.c
DASUMKERNEL = dasum.c
CASUMKERNEL = casum.c
ZASUMKERNEL = zasum.c
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SCOPYKERNEL = scopy.c
DCOPYKERNEL = dcopy.c
CCOPYKERNEL = ccopy.c
ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
DSDOTKERNEL = dsdot.c
# nrm2 uses the sources under ../arm/ for all four types.
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
# rot, scal, swap.
SROTKERNEL = srot.c
DROTKERNEL = drot.c
CROTKERNEL = crot.c
ZROTKERNEL = zrot.c
SSCALKERNEL = sscal.c
DSCALKERNEL = dscal.c
CSCALKERNEL = cscal.c
ZSCALKERNEL = zscal.c
SSWAPKERNEL = sswap.c
DSWAPKERNEL = dswap.c
CSWAPKERNEL = cswap.c
ZSWAPKERNEL = zswap.c
# gemv: the real types use local kernels, the complex types use ../arm/.
SGEMVNKERNEL = sgemv_n_4.c
DGEMVNKERNEL = dgemv_n_4.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = sgemv_t_4.c
DGEMVTKERNEL = dgemv_t_4.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
# trmm kernels.
STRMMKERNEL = strmm8x4V.S
DTRMMKERNEL = trmm8x4V.S
CTRMMKERNEL = ctrmm4x4V.S
ZTRMMKERNEL = ztrmm4x4V.S
# gemm kernels and their copy routines. The S, C and Z gemm kernels name the
# same .S file as the corresponding trmm kernel above.
SGEMMKERNEL = strmm8x4V.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = gemm8x4V.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ctrmm4x4V.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ztrmm4x4V.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
# NOTE(review): these two object names use $(TSUFFIX).$(SUFFIX) while every
# other *COPYOBJ entry in this file uses a plain .o suffix - confirm intended.
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# trsm kernels: all four types and all four variants use ../generic/ sources.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

269
kernel/zarch/camax.c Normal file
View File

@ -0,0 +1,269 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
/*
 * Vectorised kernel: returns the largest |re| + |im| over the first n
 * single-precision complex elements of x (interleaved re, im storage).
 *
 * The loop runs n >> 5 times and consumes 256 bytes (32 complex elements)
 * per iteration, so the caller must pass a positive multiple of 32.
 *
 * Register use: v0 holds four running maxima, one per lane; r0 is the loop
 * counter and r1 the running byte offset into x. In every group of loads the
 * even-numbered vector receives four real parts and the odd-numbered vector
 * the four matching imaginary parts, in lanes 0..3.
 */
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amax;

    __asm__ volatile (
        /* Seed v0 with |re| + |im| of the first four elements. */
        "vlef %%v0,0(%2),0 \n\t"
        "vlef %%v16,4(%2),0 \n\t"
        "vlef %%v0,8(%2),1 \n\t"
        "vlef %%v16,12(%2),1 \n\t"
        "vlef %%v0,16(%2),2 \n\t"
        "vlef %%v16,20(%2),2 \n\t"
        "vlef %%v0,24(%2),3 \n\t"
        "vlef %%v16,28(%2),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v16,%%v16 \n\t"
        "vfasb %%v0,%%v0,%%v16 \n\t"
        /* r0 = n / 32 iterations, r1 = 0. */
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* Prefetch ahead of the current position (indexed by r1). */
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* First 16 elements of the iteration: bytes 0..127. */
        "vlef %%v16,0(%%r1,%2),0 \n\t"
        "vlef %%v17,4(%%r1,%2),0 \n\t"
        "vlef %%v16,8(%%r1,%2),1 \n\t"
        "vlef %%v17,12(%%r1,%2),1 \n\t"
        "vlef %%v16,16(%%r1,%2),2 \n\t"
        "vlef %%v17,20(%%r1,%2),2 \n\t"
        "vlef %%v16,24(%%r1,%2),3 \n\t"
        "vlef %%v17,28(%%r1,%2),3 \n\t"
        "vlef %%v18,32(%%r1,%2),0 \n\t"
        "vlef %%v19,36(%%r1,%2),0 \n\t"
        "vlef %%v18,40(%%r1,%2),1 \n\t"
        "vlef %%v19,44(%%r1,%2),1 \n\t"
        "vlef %%v18,48(%%r1,%2),2 \n\t"
        "vlef %%v19,52(%%r1,%2),2 \n\t"
        "vlef %%v18,56(%%r1,%2),3 \n\t"
        /* Imaginary part of the element whose real part is at offset 56. */
        "vlef %%v19,60(%%r1,%2),3 \n\t"
        "vlef %%v20,64(%%r1,%2),0 \n\t"
        "vlef %%v21,68(%%r1,%2),0 \n\t"
        "vlef %%v20,72(%%r1,%2),1 \n\t"
        "vlef %%v21,76(%%r1,%2),1 \n\t"
        "vlef %%v20,80(%%r1,%2),2 \n\t"
        "vlef %%v21,84(%%r1,%2),2 \n\t"
        "vlef %%v20,88(%%r1,%2),3 \n\t"
        "vlef %%v21,92(%%r1,%2),3 \n\t"
        "vlef %%v22,96(%%r1,%2),0 \n\t"
        "vlef %%v23,100(%%r1,%2),0 \n\t"
        "vlef %%v22,104(%%r1,%2),1 \n\t"
        "vlef %%v23,108(%%r1,%2),1 \n\t"
        "vlef %%v22,112(%%r1,%2),2 \n\t"
        "vlef %%v23,116(%%r1,%2),2 \n\t"
        "vlef %%v22,120(%%r1,%2),3 \n\t"
        "vlef %%v23,124(%%r1,%2),3 \n\t"
        /* Absolute values, then |re| + |im| into v16..v19. */
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        /* Compare/select tree: v0 = max(v0, v16, v17, v18, v19) per lane. */
        "vfchsb %%v24,%%v16,%%v17 \n\t"
        "vfchsb %%v25,%%v18,%%v19 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vfchsb %%v26,%%v24,%%v25 \n\t"
        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
        "vfchsb %%v27,%%v26,%%v0 \n\t"
        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"

        /* Second 16 elements of the iteration: bytes 128..255. */
        "vlef %%v16,128(%%r1,%2),0 \n\t"
        "vlef %%v17,132(%%r1,%2),0 \n\t"
        "vlef %%v16,136(%%r1,%2),1 \n\t"
        "vlef %%v17,140(%%r1,%2),1 \n\t"
        "vlef %%v16,144(%%r1,%2),2 \n\t"
        "vlef %%v17,148(%%r1,%2),2 \n\t"
        "vlef %%v16,152(%%r1,%2),3 \n\t"
        "vlef %%v17,156(%%r1,%2),3 \n\t"
        "vlef %%v18,160(%%r1,%2),0 \n\t"
        "vlef %%v19,164(%%r1,%2),0 \n\t"
        "vlef %%v18,168(%%r1,%2),1 \n\t"
        "vlef %%v19,172(%%r1,%2),1 \n\t"
        "vlef %%v18,176(%%r1,%2),2 \n\t"
        "vlef %%v19,180(%%r1,%2),2 \n\t"
        "vlef %%v18,184(%%r1,%2),3 \n\t"
        "vlef %%v19,188(%%r1,%2),3 \n\t"
        "vlef %%v20,192(%%r1,%2),0 \n\t"
        "vlef %%v21,196(%%r1,%2),0 \n\t"
        "vlef %%v20,200(%%r1,%2),1 \n\t"
        "vlef %%v21,204(%%r1,%2),1 \n\t"
        "vlef %%v20,208(%%r1,%2),2 \n\t"
        "vlef %%v21,212(%%r1,%2),2 \n\t"
        "vlef %%v20,216(%%r1,%2),3 \n\t"
        "vlef %%v21,220(%%r1,%2),3 \n\t"
        "vlef %%v22,224(%%r1,%2),0 \n\t"
        "vlef %%v23,228(%%r1,%2),0 \n\t"
        "vlef %%v22,232(%%r1,%2),1 \n\t"
        "vlef %%v23,236(%%r1,%2),1 \n\t"
        "vlef %%v22,240(%%r1,%2),2 \n\t"
        "vlef %%v23,244(%%r1,%2),2 \n\t"
        "vlef %%v22,248(%%r1,%2),3 \n\t"
        "vlef %%v23,252(%%r1,%2),3 \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        "vfchsb %%v24,%%v16,%%v17 \n\t"
        "vfchsb %%v25,%%v18,%%v19 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vfchsb %%v26,%%v24,%%v25 \n\t"
        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
        "vfchsb %%v27,%%v26,%%v0 \n\t"
        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"

        /* Advance 256 bytes and loop while r0 is non-zero. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Horizontal reduction of the four lanes of v0 into lane 0. */
        "veslg %%v16,%%v0,32 \n\t"
        "vfchsb %%v17,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "vrepf %%v16,%%v0,2 \n\t"
        "wfchsb %%v17,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "ler %0,%%f0 "
        :"=f"(amax)
        /* n complex elements occupy n * 2 FLOATs. */
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return amax;
}
/*
 * Returns the maximum of |re| + |im| over n complex elements of x taken
 * with stride inc_x (counted in complex elements).
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT maxf = 0.0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return (maxf);

    if (inc_x == 1) {
        /* Largest multiple of 32 not exceeding n goes to the vector kernel. */
        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            maxf = camax_kernel_32(n1, x);
            i = n1;
        }
        else
        {
            maxf = CABS1(x,0);
            i++;
        }

        /* Scalar tail. i counts complex elements, so element i starts at
           x[i*2]; compare the full |re| + |im|, not just the real part. */
        while (i < n) {
            if (CABS1(x,i*2) > maxf) {
                maxf = CABS1(x,i*2);
            }
            i++;
        }
        return (maxf);

    } else {
        /* Strided path: i is an index into x in FLOATs, j counts elements. */
        inc_x2 = 2 * inc_x;

        maxf = CABS1(x,0);
        i += inc_x2;
        j++;

        /* Four elements per pass while a whole group of four remains. */
        BLASLONG n1 = (n - 1) & -4;
        while (j < n1) {
            if (CABS1(x,i) > maxf) {
                maxf = CABS1(x,i);
            }
            if (CABS1(x,i+inc_x2) > maxf) {
                maxf = CABS1(x,i+inc_x2);
            }
            if (CABS1(x,i+inc_x2*2) > maxf) {
                maxf = CABS1(x,i+inc_x2*2);
            }
            if (CABS1(x,i+inc_x2*3) > maxf) {
                maxf = CABS1(x,i+inc_x2*3);
            }
            i += inc_x2 * 4;
            j += 4;
        }

        /* Remaining elements one at a time. */
        while (j < n) {
            if (CABS1(x,i) > maxf) {
                maxf = CABS1(x,i);
            }
            i += inc_x2;
            j++;
        }
        return (maxf);
    }
}

269
kernel/zarch/camin.c Normal file
View File

@ -0,0 +1,269 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
/*
 * Vectorised kernel: returns the smallest |re| + |im| over the first n
 * single-precision complex elements of x (interleaved re, im storage).
 *
 * The loop runs n >> 5 times and consumes 256 bytes (32 complex elements)
 * per iteration, so the caller must pass a positive multiple of 32.
 *
 * Register use: v0 holds four running minima, one per lane; r0 is the loop
 * counter and r1 the running byte offset into x. In every group of loads the
 * even-numbered vector receives four real parts and the odd-numbered vector
 * the four matching imaginary parts. Each of the four elements must land in
 * its own lane (0, 1, 2, 3); otherwise one lane is left unloaded.
 */
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amin;

    __asm__ volatile (
        /* Seed v0 with |re| + |im| of the first four elements. */
        "vlef %%v0,0(%2),0 \n\t"
        "vlef %%v16,4(%2),0 \n\t"
        "vlef %%v0,8(%2),1 \n\t"
        "vlef %%v16,12(%2),1 \n\t"
        "vlef %%v0,16(%2),2 \n\t"
        "vlef %%v16,20(%2),2 \n\t"
        "vlef %%v0,24(%2),3 \n\t"
        "vlef %%v16,28(%2),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v16,%%v16 \n\t"
        "vfasb %%v0,%%v0,%%v16 \n\t"
        /* r0 = n / 32 iterations, r1 = 0. */
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* First 16 elements of the iteration: bytes 0..127. */
        "vlef %%v16,0(%%r1,%2),0 \n\t"
        "vlef %%v17,4(%%r1,%2),0 \n\t"
        "vlef %%v16,8(%%r1,%2),1 \n\t"
        "vlef %%v17,12(%%r1,%2),1 \n\t"
        "vlef %%v16,16(%%r1,%2),2 \n\t"
        "vlef %%v17,20(%%r1,%2),2 \n\t"
        "vlef %%v16,24(%%r1,%2),3 \n\t"
        "vlef %%v17,28(%%r1,%2),3 \n\t"
        "vlef %%v18,32(%%r1,%2),0 \n\t"
        "vlef %%v19,36(%%r1,%2),0 \n\t"
        "vlef %%v18,40(%%r1,%2),1 \n\t"
        "vlef %%v19,44(%%r1,%2),1 \n\t"
        "vlef %%v18,48(%%r1,%2),2 \n\t"
        "vlef %%v19,52(%%r1,%2),2 \n\t"
        "vlef %%v18,56(%%r1,%2),3 \n\t"
        /* Imaginary part of the element whose real part is at offset 56. */
        "vlef %%v19,60(%%r1,%2),3 \n\t"
        "vlef %%v20,64(%%r1,%2),0 \n\t"
        "vlef %%v21,68(%%r1,%2),0 \n\t"
        "vlef %%v20,72(%%r1,%2),1 \n\t"
        "vlef %%v21,76(%%r1,%2),1 \n\t"
        "vlef %%v20,80(%%r1,%2),2 \n\t"
        "vlef %%v21,84(%%r1,%2),2 \n\t"
        "vlef %%v20,88(%%r1,%2),3 \n\t"
        "vlef %%v21,92(%%r1,%2),3 \n\t"
        "vlef %%v22,96(%%r1,%2),0 \n\t"
        "vlef %%v23,100(%%r1,%2),0 \n\t"
        "vlef %%v22,104(%%r1,%2),1 \n\t"
        "vlef %%v23,108(%%r1,%2),1 \n\t"
        "vlef %%v22,112(%%r1,%2),2 \n\t"
        "vlef %%v23,116(%%r1,%2),2 \n\t"
        "vlef %%v22,120(%%r1,%2),3 \n\t"
        "vlef %%v23,124(%%r1,%2),3 \n\t"
        /* Absolute values, then |re| + |im| into v16..v19. */
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        /* Compare/select tree: v0 = min(v0, v16, v17, v18, v19) per lane.
           The compare operands are swapped relative to a max reduction. */
        "vfchsb %%v24,%%v17,%%v16 \n\t"
        "vfchsb %%v25,%%v19,%%v18 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vfchsb %%v26,%%v25,%%v24 \n\t"
        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
        "vfchsb %%v27,%%v0,%%v26 \n\t"
        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"

        /* Second 16 elements of the iteration: bytes 128..255. */
        "vlef %%v16,128(%%r1,%2),0 \n\t"
        "vlef %%v17,132(%%r1,%2),0 \n\t"
        "vlef %%v16,136(%%r1,%2),1 \n\t"
        "vlef %%v17,140(%%r1,%2),1 \n\t"
        "vlef %%v16,144(%%r1,%2),2 \n\t"
        "vlef %%v17,148(%%r1,%2),2 \n\t"
        "vlef %%v16,152(%%r1,%2),3 \n\t"
        "vlef %%v17,156(%%r1,%2),3 \n\t"
        "vlef %%v18,160(%%r1,%2),0 \n\t"
        "vlef %%v19,164(%%r1,%2),0 \n\t"
        "vlef %%v18,168(%%r1,%2),1 \n\t"
        "vlef %%v19,172(%%r1,%2),1 \n\t"
        "vlef %%v18,176(%%r1,%2),2 \n\t"
        "vlef %%v19,180(%%r1,%2),2 \n\t"
        "vlef %%v18,184(%%r1,%2),3 \n\t"
        "vlef %%v19,188(%%r1,%2),3 \n\t"
        "vlef %%v20,192(%%r1,%2),0 \n\t"
        "vlef %%v21,196(%%r1,%2),0 \n\t"
        "vlef %%v20,200(%%r1,%2),1 \n\t"
        "vlef %%v21,204(%%r1,%2),1 \n\t"
        "vlef %%v20,208(%%r1,%2),2 \n\t"
        "vlef %%v21,212(%%r1,%2),2 \n\t"
        "vlef %%v20,216(%%r1,%2),3 \n\t"
        "vlef %%v21,220(%%r1,%2),3 \n\t"
        "vlef %%v22,224(%%r1,%2),0 \n\t"
        "vlef %%v23,228(%%r1,%2),0 \n\t"
        "vlef %%v22,232(%%r1,%2),1 \n\t"
        "vlef %%v23,236(%%r1,%2),1 \n\t"
        "vlef %%v22,240(%%r1,%2),2 \n\t"
        "vlef %%v23,244(%%r1,%2),2 \n\t"
        "vlef %%v22,248(%%r1,%2),3 \n\t"
        "vlef %%v23,252(%%r1,%2),3 \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        "vfchsb %%v24,%%v17,%%v16 \n\t"
        "vfchsb %%v25,%%v19,%%v18 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vfchsb %%v26,%%v25,%%v24 \n\t"
        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
        "vfchsb %%v27,%%v0,%%v26 \n\t"
        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"

        /* Advance 256 bytes and loop while r0 is non-zero. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Horizontal reduction of the four lanes of v0 into lane 0. */
        "veslg %%v16,%%v0,32 \n\t"
        "vfchsb %%v17,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "vrepf %%v16,%%v0,2 \n\t"
        "wfchsb %%v17,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "ler %0,%%f0 "
        :"=f"(amin)
        /* n complex elements occupy n * 2 FLOATs. */
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return amin;
}
/*
 * Returns the minimum of |re| + |im| over n complex elements of x taken
 * with stride inc_x (counted in complex elements).
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT minf = 0.0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return (minf);

    if (inc_x == 1) {
        /* Largest multiple of 32 not exceeding n goes to the vector kernel. */
        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            minf = camin_kernel_32(n1, x);
            i = n1;
        }
        else
        {
            minf = CABS1(x,0);
            i++;
        }

        /* Scalar tail. i counts complex elements, so element i starts at
           x[i*2]; compare the full |re| + |im|, not just the real part. */
        while (i < n) {
            if (CABS1(x,i*2) < minf) {
                minf = CABS1(x,i*2);
            }
            i++;
        }
        return (minf);

    } else {
        /* Strided path: i is an index into x in FLOATs, j counts elements. */
        inc_x2 = 2 * inc_x;

        minf = CABS1(x,0);
        i += inc_x2;
        j++;

        /* Four elements per pass while a whole group of four remains. */
        BLASLONG n1 = (n - 1) & -4;
        while (j < n1) {
            if (CABS1(x,i) < minf) {
                minf = CABS1(x,i);
            }
            if (CABS1(x,i+inc_x2) < minf) {
                minf = CABS1(x,i+inc_x2);
            }
            if (CABS1(x,i+inc_x2*2) < minf) {
                minf = CABS1(x,i+inc_x2*2);
            }
            if (CABS1(x,i+inc_x2*3) < minf) {
                minf = CABS1(x,i+inc_x2*3);
            }
            i += inc_x2 * 4;
            j += 4;
        }

        /* Remaining elements one at a time. */
        while (j < n) {
            if (CABS1(x,i) < minf) {
                minf = CABS1(x,i);
            }
            i += inc_x2;
            j++;
        }
        return (minf);
    }
}

167
kernel/zarch/casum.c Normal file
View File

@ -0,0 +1,167 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vectorised kernel: returns the sum of the absolute values of the first
 * n * 2 FLOATs of x, i.e. sum(|re| + |im|) over n complex elements.
 * The loop runs n >> 5 times and consumes 256 bytes per iteration, so the
 * caller passes a positive multiple of 32.
 * v0..v3 are four independent partial-sum accumulators; r0 is the loop
 * counter and r1 the running byte offset into x.
 */
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT asum;
__asm__ (
/* Clear the four accumulators. */
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
/* r0 = n / 32 iterations, r1 = 0. */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* First 128 bytes: load, take absolute values, accumulate. */
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
/* Second 128 bytes: same sequence at offsets 128..240. */
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
/* Advance 256 bytes and loop while r0 is non-zero. */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
/* Fold the four accumulators into v0, then the four lanes into f0. */
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
}
/*
 * Sum of |re| + |im| over n complex elements of x taken with stride inc_x
 * (counted in complex elements). Returns 0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;
    BLASLONG done = 0;   /* complex elements already accumulated */
    BLASLONG pos = 0;    /* index of the current real part in x */
    BLASLONG step;

    if (n <= 0 || inc_x <= 0)
        return(total);

    /* Distance between consecutive elements, in FLOATs. */
    step = 2 * inc_x;

    if (inc_x == 1) {
        /* Contiguous data: the vector kernel takes the largest multiple of 32. */
        BLASLONG blocked = n & -32;

        if (blocked > 0) {
            total = casum_kernel_32(blocked, x);
            done = blocked;
            pos = 2 * blocked;
        }
    }

    /* Scalar loop: the tail of the contiguous case, or the whole strided case. */
    for (; done < n; done++, pos += step)
        total += ABS(x[pos]) + ABS(x[pos+1]);

    return(total);
}

174
kernel/zarch/caxpy.c Normal file
View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vectorised kernel: y += alpha * x on interleaved single-precision complex
 * data, where alpha[0] and alpha[1] are the real and imaginary coefficients.
 * With CONJ defined the sign pattern of the coefficients changes (see below).
 * The loop runs n >> 4 times and handles 128 bytes of x and of y per
 * iteration, so the caller passes a positive multiple of 16.
 * r0 is the loop counter and r1 the running byte offset into x and y.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
#if !defined(CONJ)
/* v0 = alpha[0] in every lane; v1 = alpha[1] negated in lanes 0 and 2,
   unchanged in lanes 1 and 3 (those lanes are loaded after the negate). */
"vlrepf %%v0,0(%3) \n\t"
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v1,4(%3),2 \n\t"
"vflcsb %%v1,%%v1 \n\t"
"vlef %%v1,4(%3),1 \n\t"
"vlef %%v1,4(%3),3 \n\t"
#else
/* v0 = alpha[0] negated in lanes 1 and 3, unchanged in lanes 0 and 2;
   v1 = alpha[1] in every lane. */
"vlef %%v0,0(%3),1 \n\t"
"vlef %%v0,0(%3),3 \n\t"
"vflcsb %%v0,%%v0 \n\t"
"vlef %%v0,0(%3),0 \n\t"
"vlef %%v0,0(%3),2 \n\t"
"vlrepf %%v1,4(%3) \n\t"
#endif
/* r0 = n / 16 iterations, r1 = 0. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* First 64 bytes: v16..v19 = x, v20..v23 = y. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
/* v24..v27 = x with the two 32-bit halves of each doubleword swapped. */
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"
/* v28..v31 = x * v0 + y, then += swapped x * v1. */
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"
"vst %%v28,0(%%r1,%2) \n\t"
"vst %%v29,16(%%r1,%2) \n\t"
"vst %%v30,32(%%r1,%2) \n\t"
"vst %%v31,48(%%r1,%2) \n\t"
/* Second 64 bytes: same sequence at offsets 64..112. */
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,80(%%r1,%1) \n\t"
"vl %%v18,96(%%r1,%1) \n\t"
"vl %%v19,112(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"
"vst %%v28,64(%%r1,%2) \n\t"
"vst %%v29,80(%%r1,%2) \n\t"
"vst %%v30,96(%%r1,%2) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"
/* Advance 128 bytes and loop while r0 is non-zero. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Complex axpy: y += (da_r + i*da_i) * x over n elements, with the
 * conjugated sign pattern when CONJ is defined. inc_x and inc_y are strides
 * in complex elements. Always returns 0; does nothing when n <= 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    BLASLONG done = 0;          /* complex elements already updated */
    BLASLONG xi = 0, yi = 0;    /* indices of the current real parts */
    BLASLONG step_x, step_y;    /* strides expressed in FLOATs */
    FLOAT coeff[2];

    if (n <= 0) return (0);

    step_x = 2 * inc_x;
    step_y = 2 * inc_y;

    if ((inc_x == 1) && (inc_y == 1)) {
        /* Contiguous data: the vector kernel takes the largest multiple of 16. */
        BLASLONG blocked = n & -16;

        if (blocked) {
            coeff[0] = da_r;
            coeff[1] = da_i;
            caxpy_kernel_16(blocked, x, y, coeff);
            xi = 2 * blocked;
            yi = 2 * blocked;
        }
        done = blocked;
    }

    /* Scalar loop: the tail of the contiguous case, or the whole strided case. */
    for (; done < n; done++, xi += step_x, yi += step_y) {
#if !defined(CONJ)
        y[yi] += (da_r * x[xi] - da_i * x[xi + 1]);
        y[yi + 1] += (da_r * x[xi + 1] + da_i * x[xi]);
#else
        y[yi] += (da_r * x[xi] + da_i * x[xi + 1]);
        y[yi + 1] -= (da_r * x[xi + 1] - da_i * x[xi]);
#endif
    }

    return (0);
}

99
kernel/zarch/ccopy.c Normal file
View File

@ -0,0 +1,99 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Block-copy kernel: copies x to y in chunks of 256 bytes (32 complex
 * single-precision elements) with one mvc per chunk.
 * The loop runs n >> 5 times, so the caller passes a positive multiple of 32.
 * r1 and r2 are working copies of the source and destination addresses;
 * r0 is the loop counter.
 */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
/* r0 = n / 32 iterations. */
"srlg %%r0,%0,5 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
/* Copy 256 bytes from (r1) to (r2), then advance both pointers. */
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","r2"
);
}
/*
 * Complex copy: y = x over n elements. inc_x and inc_y are strides in
 * complex elements. Always returns 0; does nothing when n <= 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i=0;
    BLASLONG ix=0,iy=0;

    if ( n <= 0 ) return(0);

    if ( (inc_x == 1) && (inc_y == 1 ))
    {
        /* Contiguous data: the block kernel takes the largest multiple of 32. */
        BLASLONG n1 = n & -32;
        if ( n1 > 0 )
        {
            ccopy_kernel_32(n1, x, y);
            i=n1;
            ix=n1*2;
            iy=n1*2;
        }

        /* Scalar tail: read through ix, write through iy. */
        while(i < n)
        {
            y[iy] = x[ix] ;
            y[iy+1] = x[ix+1] ;
            ix+=2;
            iy+=2;
            i++ ;
        }
    }
    else
    {
        /* Strided copy: strides are doubled to step over (re, im) pairs. */
        BLASLONG inc_x2 = 2 * inc_x;
        BLASLONG inc_y2 = 2 * inc_y;

        while(i < n)
        {
            y[iy] = x[ix] ;
            y[iy+1] = x[ix+1] ;
            ix += inc_x2 ;
            iy += inc_y2 ;
            i++ ;
        }
    }
    return(0);
}

182
kernel/zarch/cdot.c Normal file
View File

@ -0,0 +1,182 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector kernel for the complex dot product.  Processes n two-FLOAT elements
 * of x and y, 16 elements (128 bytes) per loop iteration, for (n >> 4)
 * iterations.  The loop body runs before the counter is tested, so n must be
 * at least 16; the caller in this file only passes a non-zero multiple of 16.
 *
 * On return d[0..3] are overwritten (not accumulated into) with the four
 * partial sums, in the same layout the scalar code in CNAME uses:
 *   d[0] = sum x[2k]   * y[2k]
 *   d[1] = sum x[2k+1] * y[2k+1]
 *   d[2] = sum x[2k]   * y[2k+1]
 *   d[3] = sum x[2k+1] * y[2k]
 *
 * Operands: %0 = n, %1 = x, %2 = y, %3 = d.
 * The "sb" instruction forms operate on 4-byte lanes, so FLOAT is assumed to
 * be single precision -- TODO confirm.
 */
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
__asm__ volatile(
/* v24/v26/v28/v30 accumulate x*y, v25/v27/v29/v31 accumulate swap(x)*y. */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"vzero %%v28 \n\t"
"vzero %%v29 \n\t"
"vzero %%v30 \n\t"
"vzero %%v31 \n\t"
/* r0 = n / 16 iterations, r1 = running byte offset into x and y. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* First 64 bytes of this iteration: x into v16-v19, y into v0-v3. */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
/* Rotate each 64-bit pair by 32 bits: swaps the two FLOATs of every x element. */
"verllg %%v20,%%v16,32 \n\t"
"verllg %%v21,%%v17,32 \n\t"
"verllg %%v22,%%v18,32 \n\t"
"verllg %%v23,%%v19,32 \n\t"
/* Fused multiply-add into the eight accumulators. */
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"
/* Second 64 bytes of this iteration, same pattern. */
"vl %%v16, 64(%%r1,%1) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t"
"vl %%v19, 112(%%r1,%1) \n\t"
"vl %%v0, 64(%%r1,%2) \n\t"
"vl %%v1, 80(%%r1,%2) \n\t"
"vl %%v2, 96(%%r1,%2) \n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"verllg %%v20,%%v16,32 \n\t"
"verllg %%v21,%%v17,32 \n\t"
"verllg %%v22,%%v18,32 \n\t"
"verllg %%v23,%%v19,32 \n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* Reduce the four x*y accumulators into v24, then fold its upper 64-bit
   half onto the lower half so lanes 0 and 1 hold the final sums. */
"vfasb %%v24,%%v24,%%v26 \n\t"
"vfasb %%v24,%%v24,%%v28 \n\t"
"vfasb %%v24,%%v24,%%v30 \n\t"
"vrepg %%v26,%%v24,1 \n\t"
"vfasb %%v24,%%v24,%%v26 \n\t"
/* Same reduction for the swap(x)*y accumulators into v25. */
"vfasb %%v25,%%v25,%%v27 \n\t"
"vfasb %%v25,%%v25,%%v29 \n\t"
"vfasb %%v25,%%v25,%%v31 \n\t"
"vrepg %%v27,%%v25,1 \n\t"
"vfasb %%v25,%%v25,%%v27 \n\t"
/* Store lanes to d[0..3]; note v25 lane 1 goes to d[2] and lane 0 to d[3]. */
"vstef %%v24,0(%3),0 \n\t"
"vstef %%v24,4(%3),1 \n\t"
"vstef %%v25,8(%3),1 \n\t"
"vstef %%v25,12(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Complex dot product of x and y over n two-FLOAT elements.
 *
 * n      number of elements; for n <= 0 the result is (0, 0).
 * x, y   input vectors, two consecutive FLOATs per element.
 * inc_x  stride of x in elements.
 * inc_y  stride of y in elements.
 *
 * Four partial sums are accumulated:
 *   dot[0] = sum x[2k]   * y[2k]      dot[1] = sum x[2k+1] * y[2k+1]
 *   dot[2] = sum x[2k]   * y[2k+1]    dot[3] = sum x[2k+1] * y[2k]
 * and combined at the end; the signs depend on whether CONJ is defined.
 *
 * When both strides are 1 the largest multiple of 16 elements is handled by
 * cdot_kernel_16 and the scalar loop only adds the remainder.
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG i;
    OPENBLAS_COMPLEX_FLOAT result;
    FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};

    if (n <= 0) {
        CREAL(result) = 0.0;
        CIMAG(result) = 0.0;
        return (result);
    }

    if ((inc_x == 1) && (inc_y == 1)) {
        BLASLONG n1 = n & -16;

        /* The kernel overwrites dot[0..3] with the sums for the first n1 elements. */
        if (n1)
            cdot_kernel_16(n1, x, y, dot);

        i = n1;
        BLASLONG j = i * 2;

        while (i < n) {
            dot[0] += x[j] * y[j];
            dot[1] += x[j + 1] * y[j + 1];
            dot[2] += x[j] * y[j + 1];
            dot[3] += x[j + 1] * y[j];
            j += 2;
            i++;
        }
    } else {
        /* Strides are in elements; convert to FLOAT units by multiplying.
         * A left shift is avoided because the stride is signed and shifting
         * a negative value is undefined behaviour in C. */
        BLASLONG inc_x2 = 2 * inc_x;
        BLASLONG inc_y2 = 2 * inc_y;
        BLASLONG ix = 0;
        BLASLONG iy = 0;

        i = 0;
        while (i < n) {
            dot[0] += x[ix] * y[iy];
            dot[1] += x[ix + 1] * y[iy + 1];
            dot[2] += x[ix] * y[iy + 1];
            dot[3] += x[ix + 1] * y[iy];
            ix += inc_x2;
            iy += inc_y2;
            i++;
        }
    }

#if !defined(CONJ)
    CREAL(result) = dot[0] - dot[1];
    CIMAG(result) = dot[2] + dot[3];
#else
    CREAL(result) = dot[0] + dot[1];
    CIMAG(result) = dot[2] - dot[3];
#endif

    return (result);
}

256
kernel/zarch/crot.c Normal file
View File

@ -0,0 +1,256 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector kernel for the plane rotation, applied in place to every FLOAT of
 * x and y:
 *   x_new = c * x + s * y
 *   y_new = c * y - s * x
 * Processes 256 bytes of each vector per loop iteration (four unrolled
 * 64-byte groups) for (n >> 5) iterations, i.e. 32 two-FLOAT elements per
 * iteration assuming 4-byte FLOATs (the "sb" instruction forms) -- TODO
 * confirm.  The loop body runs before the counter is tested, so n must be at
 * least 32; the caller in this file only passes a positive multiple of 32.
 *
 * Operands: %0 = n, %1 = x, %2 = y, %3 = *c, %4 = *s.
 * v0 holds c replicated to all lanes, v1 holds s replicated to all lanes.
 */
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"vlrepf %%v0,%3 \n\t"
"vlrepf %%v1,%4 \n\t"
/* r0 = n / 32 iterations, r1 = running byte offset into x and y. */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* Group 1 (bytes 0..63): x into v24-v27, y into v16-v19. */
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
/* v28-v31 = x*c (first half of x_new); v20-v23 = x*s. */
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
/* x_new = y*s + x*c (multiply-add); y_new = y*c - x*s (multiply-subtract). */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
/* Write the rotated values back over x and y. */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
/* Group 2 (bytes 64..127), same pattern. */
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
/* Group 3 (bytes 128..191), same pattern. */
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
/* Group 4 (bytes 192..255), same pattern. */
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Apply a plane rotation with real coefficients c and s to n two-FLOAT
 * elements of x and y, in place:
 *   x_new = c * x + s * y
 *   y_new = c * y - s * x
 * (applied to both FLOATs of every element).
 *
 * inc_x / inc_y are strides in elements.  Always returns 0; n <= 0 is a
 * no-op.  With unit strides the largest multiple of 32 elements is handled
 * by crot_kernel_32 and the scalar loop below finishes the remainder.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG done = 0;          /* elements already rotated */
    BLASLONG px = 0, py = 0;    /* FLOAT offsets of the next element */
    BLASLONG step_x, step_y;    /* FLOAT distance between elements */

    if (n <= 0) {
        return 0;
    }

    if (inc_x == 1 && inc_y == 1) {
        const BLASLONG bulk = n & -32;

        if (bulk > 0) {
            /* The kernel reads the coefficients through pointers. */
            FLOAT cosa = c;
            FLOAT sina = s;

            crot_kernel_32(bulk, x, y, &cosa, &sina);
            done = bulk;
            px = 2 * bulk;
            py = px;
        }
        step_x = 2;
        step_y = 2;
    } else {
        step_x = 2 * inc_x;
        step_y = 2 * inc_y;
    }

    for (; done < n; done++) {
        /* Compute the new x first, from the still-unmodified x and y. */
        const FLOAT rot0 = c*x[px] + s*y[py];
        const FLOAT rot1 = c*x[px+1] + s*y[py+1];

        y[py] = c*y[py] - s*x[px];
        y[py+1] = c*y[py+1] - s*x[px+1];
        x[px] = rot0;
        x[px+1] = rot1;

        px += step_x;
        py += step_y;
    }

    return 0;
}

456
kernel/zarch/cscal.c Normal file
View File

@ -0,0 +1,456 @@
/***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * General complex scaling kernel: multiplies every two-FLOAT element of x by
 * the complex scalar (alpha[0], alpha[1]) in place, matching the scalar code
 *   new0 = alpha[0]*x0 - alpha[1]*x1
 *   new1 = alpha[0]*x1 + alpha[1]*x0
 * Processes 128 bytes (16 elements, assuming 4-byte FLOATs -- TODO confirm)
 * per loop iteration for (n >> 4) iterations.  The loop body runs before the
 * counter is tested, so n must be at least 16; the caller in this file only
 * passes a positive multiple of 16.
 *
 * Operands: %0 = n, %1 = alpha, %2 = x.
 */
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = alpha[0] in every lane. */
"vlrepf %%v0,0(%1) \n\t"
/* Build v1 = (-alpha[1], +alpha[1], -alpha[1], +alpha[1]): load alpha[1]
   into lanes 0 and 2, negate the whole vector, then load alpha[1] into
   lanes 1 and 3 (overwriting whatever the negate left there). */
"vlef %%v1,4(%1),0 \n\t"
"vlef %%v1,4(%1),2 \n\t"
"vflcsb %%v1,%%v1 \n\t"
"vlef %%v1,4(%1),1 \n\t"
"vlef %%v1,4(%1),3 \n\t"
/* r0 = n / 16 iterations, r1 = running byte offset into x. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* v24-v31 = x with the two FLOATs of every element swapped. */
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"
"verllg %%v28,%%v20,32 \n\t"
"verllg %%v29,%%v21,32 \n\t"
"verllg %%v30,%%v22,32 \n\t"
"verllg %%v31,%%v23,32 \n\t"
/* x * alpha[0] ... */
"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"
/* ... plus swapped(x) * (-alpha[1], +alpha[1], ...). */
"vfmasb %%v16,%%v24,%%v1,%%v16 \n\t"
"vfmasb %%v17,%%v25,%%v1,%%v17 \n\t"
"vfmasb %%v18,%%v26,%%v1,%%v18 \n\t"
"vfmasb %%v19,%%v27,%%v1,%%v19 \n\t"
"vfmasb %%v20,%%v28,%%v1,%%v20 \n\t"
"vfmasb %%v21,%%v29,%%v1,%%v21 \n\t"
"vfmasb %%v22,%%v30,%%v1,%%v22 \n\t"
"vfmasb %%v23,%%v31,%%v1,%%v23 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Scaling kernel for the case where only alpha[1] is used (the caller in
 * this file selects it when alpha[0] == 0).  In place, for every two-FLOAT
 * element of x, matching the scalar code:
 *   new0 = -alpha[1] * x1
 *   new1 =  alpha[1] * x0
 * Processes 128 bytes per loop iteration for (n >> 4) iterations; n must be
 * at least 16 because the loop body runs before the counter is tested.
 *
 * Operands: %0 = n, %1 = alpha (only alpha[1] is read), %2 = x.
 */
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* Build v0 = (-alpha[1], +alpha[1], -alpha[1], +alpha[1]). */
"vlef %%v0,4(%1),0 \n\t"
"vlef %%v0,4(%1),2 \n\t"
"vflcsb %%v0,%%v0 \n\t"
"vlef %%v0,4(%1),1 \n\t"
"vlef %%v0,4(%1),3 \n\t"
/* r0 = n / 16 iterations, r1 = running byte offset into x. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* Swap the two FLOATs of every element in place in the registers. */
"verllg %%v16,%%v16,32 \n\t"
"verllg %%v17,%%v17,32 \n\t"
"verllg %%v18,%%v18,32 \n\t"
"verllg %%v19,%%v19,32 \n\t"
"verllg %%v20,%%v20,32 \n\t"
"verllg %%v21,%%v21,32 \n\t"
"verllg %%v22,%%v22,32 \n\t"
"verllg %%v23,%%v23,32 \n\t"
/* Multiply the swapped values by (-alpha[1], +alpha[1], ...). */
"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
/*
 * Scaling kernel for the case where only alpha[0] is used (the caller in
 * this file selects it when alpha[1] == 0): multiplies every FLOAT of x by
 * alpha[0] in place.
 * Processes 128 bytes per loop iteration for (n >> 4) iterations; n must be
 * at least 16 because the loop body runs before the counter is tested.
 *
 * Operands: %0 = n, %1 = alpha (only alpha[0] is read), %2 = x.
 */
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = alpha[0] in every lane. */
"vlrepf %%v0,0(%1) \n\t"
/* r0 = n / 16 iterations, r1 = running byte offset into x. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* Lane-wise multiply by alpha[0]. */
"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
/*
 * Zero-fill kernel: stores zeros over x, 128 bytes per loop iteration, for
 * (n >> 4) iterations.  The previous contents of x are never read.
 * n must be at least 16 because the loop body runs before the counter is
 * tested; the caller in this file only passes a positive multiple of 16.
 *
 * Operands: %0 = n, %1 = x.
 */
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
/* Four all-zero vector registers used as the store source. */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
/* r0 = n / 16 iterations, r1 = running byte offset into x. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
}
/*
 * Strided complex scaling helper: multiplies two-FLOAT elements of x by the
 * complex scalar (alpha[0], alpha[1]) in place, four elements per loop
 * iteration, advancing until at least n elements have been covered.
 *
 * inc_x is the distance between consecutive elements measured in FLOATs
 * (the caller in this file passes the element stride already doubled).
 * n is expected to be a multiple of 4; otherwise the last iteration touches
 * elements beyond n.
 *
 * Within one iteration all four first components are computed before any
 * store, then the second components are stored, then the first components.
 * That ordering is kept exactly as it was.
 */
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
{
    const FLOAT ar = alpha[0];
    const FLOAT ai = alpha[1];
    const BLASLONG off1 = inc_x;
    const BLASLONG off2 = 2 * inc_x;
    const BLASLONG off3 = off2 + inc_x;
    BLASLONG done;

    for (done = 0; done < n; done += 4, x += 4 * inc_x) {
        FLOAT *e0 = x;
        FLOAT *e1 = x + off1;
        FLOAT *e2 = x + off2;
        FLOAT *e3 = x + off3;

        /* First components, from the unmodified inputs. */
        const FLOAT r0 = ar * e0[0] - ai * e0[1];
        const FLOAT r1 = ar * e1[0] - ai * e1[1];
        const FLOAT r2 = ar * e2[0] - ai * e2[1];
        const FLOAT r3 = ar * e3[0] - ai * e3[1];

        /* Second components; the first components are still the originals. */
        e0[1] = ai * e0[0] + ar * e0[1];
        e1[1] = ai * e1[0] + ar * e1[1];
        e2[1] = ai * e2[0] + ar * e2[1];
        e3[1] = ai * e3[0] + ar * e3[1];

        e0[0] = r0;
        e1[0] = r1;
        e2[0] = r2;
        e3[0] = r3;
    }
}
/*
 * Complex scaling: x := (da_r, da_i) * x over n two-FLOAT elements, in place.
 *
 * n             number of elements; n <= 0 leaves x untouched.
 * da_r, da_i    the two components of the scalar.
 * x             vector, two consecutive FLOATs per element.
 * inc_x         stride of x in elements.
 * dummy0, dummy1, y, inc_y, dummy, dummy2
 *               not used by this function.
 *
 * Always returns 0.
 *
 * Four cases are distinguished so that zero components of the scalar skip
 * the corresponding multiplications:
 *   da_r == 0 && da_i == 0 : x is set to zero without reading it
 *   da_r == 0              : x := (-da_i * x1, da_i * x0)
 *   da_i == 0              : x := ( da_r * x0, da_r * x1)
 *   otherwise              : full complex multiply
 * With inc_x == 1 the largest multiple of 16 elements is handed to the
 * matching cscal_kernel_16* routine; scalar loops finish the remainder.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    BLASLONG i = 0, j = 0;
    FLOAT temp0;
    FLOAT temp1;
    FLOAT alpha[2] __attribute__ ((aligned(16)));

    if (inc_x != 1) {
        /* Stride in FLOATs.  Multiplication is used instead of a left shift
         * because inc_x is signed and shifting a negative value is undefined
         * behaviour in C. */
        const BLASLONG inc_x2 = 2 * inc_x;

        if (da_r == 0.0) {
            BLASLONG n1 = n & -2;

            if (da_i == 0.0) {
                /* Two elements per iteration, then the odd one out. */
                while (j < n1) {
                    x[i] = 0.0;
                    x[i + 1] = 0.0;
                    x[i + inc_x2] = 0.0;
                    x[i + 1 + inc_x2] = 0.0;
                    i += 2 * inc_x2;
                    j += 2;
                }
                while (j < n) {
                    x[i] = 0.0;
                    x[i + 1] = 0.0;
                    i += inc_x2;
                    j++;
                }
            } else {
                while (j < n1) {
                    temp0 = -da_i * x[i + 1];
                    x[i + 1] = da_i * x[i];
                    x[i] = temp0;
                    temp1 = -da_i * x[i + 1 + inc_x2];
                    x[i + 1 + inc_x2] = da_i * x[i + inc_x2];
                    x[i + inc_x2] = temp1;
                    i += 2 * inc_x2;
                    j += 2;
                }
                while (j < n) {
                    temp0 = -da_i * x[i + 1];
                    x[i + 1] = da_i * x[i];
                    x[i] = temp0;
                    i += inc_x2;
                    j++;
                }
            }
        } else {
            if (da_i == 0.0) {
                BLASLONG n1 = n & -2;

                while (j < n1) {
                    temp0 = da_r * x[i];
                    x[i + 1] = da_r * x[i + 1];
                    x[i] = temp0;
                    temp1 = da_r * x[i + inc_x2];
                    x[i + 1 + inc_x2] = da_r * x[i + 1 + inc_x2];
                    x[i + inc_x2] = temp1;
                    i += 2 * inc_x2;
                    j += 2;
                }
                while (j < n) {
                    temp0 = da_r * x[i];
                    x[i + 1] = da_r * x[i + 1];
                    x[i] = temp0;
                    i += inc_x2;
                    j++;
                }
            } else {
                BLASLONG n1 = n & -8;

                if (n1 > 0) {
                    alpha[0] = da_r;
                    alpha[1] = da_i;
                    /* The helper takes the stride in FLOATs. */
                    cscal_kernel_inc_8(n1, alpha, x, inc_x2);
                    j = n1;
                    i = n1 * inc_x2;
                }
                while (j < n) {
                    temp0 = da_r * x[i] - da_i * x[i + 1];
                    x[i + 1] = da_r * x[i + 1] + da_i * x[i];
                    x[i] = temp0;
                    i += inc_x2;
                    j++;
                }
            }
        }
        return (0);
    }

    /* Unit stride: vector kernels for the bulk, scalar loops for the tail. */
    BLASLONG n1 = n & -16;

    if (n1 > 0) {
        alpha[0] = da_r;
        alpha[1] = da_i;

        if (da_r == 0.0) {
            if (da_i == 0.0) {
                cscal_kernel_16_zero(n1, x);
            } else {
                cscal_kernel_16_zero_r(n1, alpha, x);
            }
        } else {
            if (da_i == 0.0) {
                cscal_kernel_16_zero_i(n1, alpha, x);
            } else {
                cscal_kernel_16(n1, alpha, x);
            }
        }

        i = n1 << 1;
        j = n1;
    }

    if (da_r == 0.0) {
        if (da_i == 0.0) {
            while (j < n) {
                x[i] = 0.0;
                x[i + 1] = 0.0;
                i += 2;
                j++;
            }
        } else {
            while (j < n) {
                temp0 = -da_i * x[i + 1];
                x[i + 1] = da_i * x[i];
                x[i] = temp0;
                i += 2;
                j++;
            }
        }
    } else {
        if (da_i == 0.0) {
            while (j < n) {
                temp0 = da_r * x[i];
                x[i + 1] = da_r * x[i + 1];
                x[i] = temp0;
                i += 2;
                j++;
            }
        } else {
            while (j < n) {
                temp0 = da_r * x[i] - da_i * x[i + 1];
                x[i + 1] = da_r * x[i + 1] + da_i * x[i];
                x[i] = temp0;
                i += 2;
                j++;
            }
        }
    }

    return (0);
}

183
kernel/zarch/cswap.c Normal file
View File

@ -0,0 +1,183 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector swap kernel: exchanges the contents of x and y, 256 bytes of each
 * per loop iteration, for (n >> 5) iterations (32 two-FLOAT elements per
 * iteration assuming 4-byte FLOATs -- TODO confirm).  The loop body runs
 * before the counter is tested, so n must be at least 32; the caller in this
 * file only passes a positive multiple of 32.
 *
 * Operands: %0 = n, %1 = x, %2 = y.
 */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
/* r0 = n / 32 iterations, r1 = running byte offset into x and y. */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* Hold the whole 256-byte chunk of x in v16-v31. */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
/* Move the first 128 bytes of y into x via v0-v7. */
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
/* Move the second 128 bytes of y into x, reusing v0-v7. */
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"
/* Finally write the saved x chunk out to y. */
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Exchange n two-FLOAT elements between x and y.
 *
 * inc_x / inc_y are strides in elements.  The dummy* parameters are not
 * used.  Always returns 0; n <= 0 is a no-op.
 *
 * With unit strides the largest multiple of 32 elements is swapped by
 * cswap_kernel_32; the scalar loop below handles whatever is left (or
 * everything, for non-unit strides).
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG done = 0;          /* elements already swapped */
    BLASLONG px = 0, py = 0;    /* FLOAT offsets of the next element */
    BLASLONG step_x, step_y;    /* FLOAT distance between elements */

    if (n <= 0) {
        return 0;
    }

    if (inc_x == 1 && inc_y == 1) {
        const BLASLONG bulk = n & -32;

        if (bulk > 0) {
            cswap_kernel_32(bulk, x, y);
            done = bulk;
            px = 2 * bulk;
            py = 2 * bulk;
        }
        step_x = 2;
        step_y = 2;
    } else {
        step_x = 2 * inc_x;
        step_y = 2 * inc_y;
    }

    for (; done < n; done++) {
        /* Save the x element, overwrite it from y, then store the copy in y. */
        const FLOAT keep0 = x[px];
        const FLOAT keep1 = x[px+1];

        x[px] = y[py];
        x[px+1] = y[py+1];
        y[py] = keep0;
        y[py+1] = keep1;

        px += step_x;
        py += step_y;
    }

    return 0;
}

206
kernel/zarch/damax.c Normal file
View File

@ -0,0 +1,206 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vector kernel returning the largest absolute value among the first n
 * entries of x.  Processes 256 bytes (two unrolled 128-byte halves) per loop
 * iteration for (n >> 5) iterations.  The instructions used are the
 * double-precision ("db") forms, so FLOAT is assumed to be double here
 * (32 values per iteration) -- TODO confirm.
 * The loop body runs before the counter is tested, so n must be at least 32;
 * the caller in this file only passes a positive multiple of 32.
 *
 * Operands: %0 = amax (result, floating-point register), %1 = n, %2 = x.
 * v0 carries the running lane-wise maximum.
 */
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amax;
__asm__ volatile (
/* Seed the running maximum with |x[0]|, |x[1]|. */
"vl %%v0,0(%2) \n\t"
"vflpdb %%v0,%%v0 \n\t"
/* r0 = n / 32 iterations, r1 = running byte offset into x. */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* First half: load 8 vectors and take absolute values. */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* Pairwise maximum tree: compare-greater produces a mask, vsel picks the
   larger operand lane by lane (8 -> 4 -> 2 -> 1 vectors). */
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* Merge this half's maximum into the running maximum v0. */
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* Second half (bytes 128..255), same pattern. */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* Horizontal step: compare the second lane of v0 against the first and
   keep the larger one in the first lane. */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* Copy the result from f0 into the output register. */
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amax;
}
/*
 * Return the largest absolute value among n entries of x taken with stride
 * inc_x.  Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * With inc_x == 1 the largest multiple of 32 entries is reduced by
 * damax_kernel_32 and the scalar loop finishes the remainder.  For other
 * strides the first entry seeds the maximum and the rest are scanned four
 * at a time, followed by a one-at-a-time tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG pos;       /* index into x of the next entry to examine */
    BLASLONG seen;      /* number of entries examined so far */
    BLASLONG k;

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    if (inc_x != 1) {
        const BLASLONG unrolled_end = (n - 1) & -4;

        best = ABS(x[0]);
        pos = inc_x;
        seen = 1;

        /* Groups of four entries, examined in order. */
        while (seen < unrolled_end) {
            for (k = 0; k < 4; k++) {
                const FLOAT mag = ABS(x[pos + k * inc_x]);
                if (mag > best) {
                    best = mag;
                }
            }
            pos += inc_x * 4;
            seen += 4;
        }

        for (; seen < n; seen++, pos += inc_x) {
            const FLOAT mag = ABS(x[pos]);
            if (mag > best) {
                best = mag;
            }
        }

        return best;
    }

    /* Contiguous case. */
    {
        const BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = damax_kernel_32(bulk, x);
            pos = bulk;
        } else {
            best = ABS(x[0]);
            pos = 1;
        }
    }

    for (; pos < n; pos++) {
        const FLOAT mag = ABS(x[pos]);
        if (mag > best) {
            best = mag;
        }
    }

    return best;
}

206
kernel/zarch/damin.c Normal file
View File

@ -0,0 +1,206 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Minimum absolute value over the doubles starting at x, computed with
 * z/Architecture vector instructions in a single inline-asm statement.
 *
 * n: element count. The loop counter is n >> 5 and every iteration consumes
 *    256 bytes (32 doubles). The caller in this file passes n & -32 and only
 *    when that is > 0.
 *    NOTE(review): brctg decrements before testing, so a count of zero
 *    (n < 32) would presumably not terminate as intended -- confirm.
 * x: base address of the data.
 *
 * Returns the value moved from f0 into amin after the final lane reduction.
 */
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amin;
__asm__ volatile (
/* Seed the running minimum v0 with |x[0]| and |x[1]|. */
"vl %%v0,0(%2) \n\t"
"vflpdb %%v0,%%v0 \n\t"
/* r0 = n >> 5 (iteration count); r1 = byte offset, starts at 0. */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* Prefetch 1024 bytes ahead of the current offset. */
"pfd 1, 1024(%%r1,%2) \n\t"
/* First half of the iteration: load 16 doubles (byte offsets 0..112). */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* Replace every lane by its absolute value. */
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* Pairwise minimum, 8 -> 4 vectors: vfchdb builds a lane mask where the
   odd-numbered register compares higher than the even one; vsel then takes
   the even register where the mask is set and the odd one elsewhere, i.e.
   the smaller lane. */
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
/* 4 -> 2 vectors. */
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
/* 2 -> 1 vector. */
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* Fold the block minimum v30 into the running minimum v0. */
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* Second half of the iteration: same sequence for byte offsets 128..240. */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* Advance 256 bytes; decrement r0 and loop while it is non-zero. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* Reduce the two lanes of v0: replicate lane 1 into v16, compare it with
   lane 0 (scalar compare), and keep the smaller value in v0. */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* Copy the result from f0 into the output register. */
"ldr %0,%%f0 "
/* Operands: %0 = amin (output), %1 = n, %2 = x. */
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amin;
}
/*
 * Smallest absolute value among n elements of x read with stride inc_x.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG pos = 0;

    /* Empty vector or non-positive stride: nothing to scan. */
    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x == 1) {
        /* Contiguous data: the leading multiple of 32 elements goes to the
           assembly kernel; with fewer than 32 elements x[0] seeds the scan. */
        const BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = damin_kernel_32(bulk, x);
            pos = bulk;
        } else {
            best = ABS(x[0]);
            pos = 1;
        }

        /* Scalar pass over whatever the kernel did not cover. */
        for (; pos < n; pos++) {
            const FLOAT mag = ABS(x[pos]);
            if (mag < best)
                best = mag;
        }
        return best;
    }

    /* Strided data: seed with the first element, then walk the remainder in
       groups of four followed by a one-at-a-time tail. */
    best = ABS(x[0]);
    pos = inc_x;

    BLASLONG seen = 1;
    const BLASLONG unroll_limit = (n - 1) & -4;

    while (seen < unroll_limit) {
        for (BLASLONG k = 0; k < 4; k++) {
            const FLOAT mag = ABS(x[pos + k * inc_x]);
            if (mag < best)
                best = mag;
        }
        pos += inc_x * 4;
        seen += 4;
    }

    for (; seen < n; seen++, pos += inc_x) {
        const FLOAT mag = ABS(x[pos]);
        if (mag < best)
            best = mag;
    }
    return best;
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -23,8 +23,7 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
*****************************************************************************/
#include "common.h"
#include <math.h>
@ -35,80 +34,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ABS fabsf
#endif
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT asum;
__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT asum ;
__asm__ (
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_temp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v2,%%v2,%%v26 \n\t"
"vfadb %%v3,%%v3,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v2,%%v2,%%v30 \n\t"
"vfadb %%v3,%%v3,%%v31 \n\t"
"vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"la %[ptr_temp],256(%[ptr_temp]) \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v2,%%v2,%%v26 \n\t"
"vfadb %%v3,%%v3,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v2,%%v2,%%v30 \n\t"
"vfadb %%v3,%%v3,%%v31 \n\t"
"clgrjl %[ptr_temp],%%r0,1b \n\t"
"vfadb %%v24,%%v0,%%v1 \n\t"
"vfadb %%v25,%%v2,%%v3 \n\t"
"vfadb %%v0,%%v25,%%v24 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %[asum],%%f0 \n\t"
: [asum] "=f"(asum),[ptr_temp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x)
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
);
return asum;
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v2 \n\t"
"vfadb %%v0,%%v0,%%v3 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;

View File

@ -25,98 +25,99 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define PREFETCH_INS 1
#if defined(Z13_A)
#include <vecintrin.h>
/*
 * y += alpha * x using two-lane double vectors.
 * x and y are reinterpreted as arrays of __vector double; each outer pass
 * updates 16 consecutive vectors (32 doubles) and the outer loop runs while
 * the vector index is below n / 2.
 */
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
    __vector double scale = {alpha, alpha};
    __vector double *dst = (__vector double *)y;
    __vector double *src = (__vector double *)x;
    BLASLONG base;
    BLASLONG lane;

    for (base = 0; base < n / 2; base += 16) {
        for (lane = 0; lane < 16; lane++) {
            dst[base + lane] += scale * src[base + lane];
        }
    }
}
#else
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
"vlrepg %%v0,%3 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
__asm__ volatile(
#if defined(PREFETCH_INS)
"pfd 1, 0(%[x_tmp]) \n\t"
"pfd 2, 0(%[y_tmp]) \n\t"
#endif
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t"
"srlg %%r0,%[n],5 \n\t"
"vlr %%v1,%%v0 \n\t"
".align 16 \n\t"
"1: \n\t"
#if defined(PREFETCH_INS)
"pfd 1, 256(%[x_tmp]) \n\t"
"pfd 2, 256(%[y_tmp]) \n\t"
#endif
"vlm %%v16,%%v23, 0(%[x_tmp]) \n\t"
"vlm %%v24, %%v31, 0(%[y_tmp]) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
"vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
"vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
"vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
"vstm %%v16,%%v23, 0(%[y_tmp]) \n\t"
"vlm %%v24,%%v31, 128(%[x_tmp]) \n\t"
"vlm %%v16,%%v23, 128(%[y_tmp]) \n\t"
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
"vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
"vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
"vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
"vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
"la %[x_tmp],256(%[x_tmp]) \n\t"
"vstm %%v24, %%v31, 128(%[y_tmp]) \n\t"
"la %[y_tmp],256(%[y_tmp]) \n\t"
"brctg %%r0,1b"
: [mem_y] "+m" (*(double (*)[n])y), [x_tmp] "+&a"(x), [y_tmp] "+&a"(y)
: [mem_x] "m" (*(const double (*)[n])x), [n] "r"(n), [alpha] "f"(alpha)
:"cc", "r0", "v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,80(%%r1,%1) \n\t"
"vl %%v26,96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v28,64(%%r1,%2) \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vl %%v30,96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vl %%v16,128(%%r1,%1) \n\t"
"vl %%v17,144(%%r1,%1) \n\t"
"vl %%v18,160(%%r1,%1) \n\t"
"vl %%v19,176(%%r1,%1) \n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"
"vl %%v24,192(%%r1,%1) \n\t"
"vl %%v25,208(%%r1,%1) \n\t"
"vl %%v26,224(%%r1,%1) \n\t"
"vl %%v27,240(%%r1,%1) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
@ -131,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -32;
if ( n1 )
daxpy_kernel_32(n1, x, y , da );
daxpy_kernel_32(n1, x, y , &da);
i = n1;
while(i < n)

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -23,95 +23,28 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
*****************************************************************************/
#include "common.h"
#if defined(Z13mvc)
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
".align 16 \n\t"
"1: \n\t"
"mvc 0(256,%[ptr_y]),0(%[ptr_x]) \n\t"
"la %[ptr_x],256(%[ptr_x]) \n\t"
"la %[ptr_y],256(%[ptr_y]) \n\t"
"brctg %[n_tmp],1b"
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n),
[ptr_x] "+&a"(x), [ptr_y] "+&a"(y)
: [mem_x] "m" (*(const double (*)[n])x)
: "cc"
);
return;
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,5 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","r2"
);
}
#else
/*
 * Copy doubles from x to y with 16-byte vector loads and stores.
 *
 * n: element count. It is shifted right by 5 inside the asm to form the
 *    loop counter, and every iteration moves 256 bytes (32 doubles).
 *    NOTE(review): presumably callers pass a positive multiple of 32; brctg
 *    decrements before testing, so a zero count would misbehave -- confirm
 *    against the caller.
 * x: source base address.
 * y: destination base address.
 */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__ volatile(
/* Prefetch the start of both arrays (code 1 = fetch, code 2 = store). */
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
/* n_tmp = n >> 5 (iteration count); r1 = byte offset, starts at 0. */
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
/* Prefetch 256 bytes ahead of the current offset in both arrays. */
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
/* Sixteen load/store pairs of 16 bytes each, cycling through v24..v27. */
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 112(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 240(%%r1,%[ptr_y]) \n\t"
/* Advance the offset by 256 bytes; decrement the counter and loop while
   it is non-zero. */
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
/* Outputs: y declared as written memory; n is a read-write, early-clobber
   register because the asm overwrites it with the loop counter. */
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n)
/* Inputs: x declared as read memory, plus both base addresses. */
: [mem_x] "m" (*(const double (*)[n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v24","v25","v26","v27"
);
return;
}
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
@ -136,21 +69,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
} else {
BLASLONG n1 = n & -4;
while (i < n1) {
y[iy] = x[ix];
y[iy + inc_y] = x[ix + inc_x];
y[iy + 2 * inc_y] = x[ix + 2 * inc_x];
y[iy + 3 * inc_y] = x[ix + 3 * inc_x];
ix += inc_x * 4;
iy += inc_y * 4;
i += 4;
}
while (i < n) {
y[iy] = x[ix];
@ -165,5 +83,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
}

View File

@ -25,116 +25,59 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(Z13)
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
FLOAT dot;
__asm__ volatile(
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
"vl %%v16, 64(%%r1 ,%[ptr_x_tmp]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 112(%%r1,%[ptr_x_tmp]) \n\t"
__asm__ volatile (
"vzero %%v0 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
"vl %%v31, 112(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
"la %%r1,128(%%r1) \n\t"
"brctg %[n_tmp],1b \n\t"
"vfadb %%v24,%%v25,%%v24 \n\t"
"vfadb %%v24,%%v26,%%v24 \n\t"
"vfadb %%v24,%%v27,%%v24 \n\t"
"vrepg %%v1,%%v24,1 \n\t"
"vfadb %%v1,%%v24,%%v1 \n\t"
"ldr %[dot], %%f1 \n\t"
: [dot] "=f"(dot) ,[n_tmp] "+&r"(n)
: [mem_x] "m"( *(const double (*)[n])x),
[mem_y] "m"( *(const double (*)[n])y),
[ptr_x_tmp]"a"(x), [ptr_y_tmp] "a"(y)
:"cc" , "r1","f1","v16", "v17","v18","v19","v20","v21","v22","v23",
"v24","v25","v26","v27","v28","v29","v30","v31"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
);
return dot;
"vl %%v24,0(%%r1,%3) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
/*
 * Plain C dot product of x and y, consumed 16 elements per pass.
 * Each pass adds two 8-term partial sums to the accumulator; the loop runs
 * while the block start index is below n.
 */
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y )
{
    FLOAT acc = 0.0;
    BLASLONG start;

    for (start = 0; start < n; start += 16) {
        const FLOAT *xs = x + start;
        const FLOAT *ys = y + start;

        /* Elements 0..7 of the current block. */
        acc += ys[0] * xs[0]
             + ys[1] * xs[1]
             + ys[2] * xs[2]
             + ys[3] * xs[3]
             + ys[4] * xs[4]
             + ys[5] * xs[5]
             + ys[6] * xs[6]
             + ys[7] * xs[7];

        /* Elements 8..15 of the current block. */
        acc += ys[8] * xs[8]
             + ys[9] * xs[9]
             + ys[10] * xs[10]
             + ys[11] * xs[11]
             + ys[12] * xs[12]
             + ys[13] * xs[13]
             + ys[14] * xs[14]
             + ys[15] * xs[15];
    }

    return acc;
}
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
@ -148,13 +91,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG n1 = n & -16;
if ( n1 ){
dot = ddot_kernel_16(n1, x, y );
i = n1;
}
if ( n1 )
dot = ddot_kernel_16(n1, x, y);
i = n1;
while(i < n)
{

View File

@ -25,186 +25,392 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 2048
#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <vecintrin.h>
#endif
#ifdef HAVE_KERNEL_4x4
#elif HAVE_KERNEL_4x4_VEC
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0,x1,x2,x3;
x0 = xo[0] * *alpha;
x1 = xo[1] * *alpha;
x2 = xo[2] * *alpha;
x3 = xo[3] * *alpha;
__vector double v_x0 = {x0,x0};
__vector double v_x1 = {x1,x1};
__vector double v_x2 = {x2,x2};
__vector double v_x3 = {x3,x3};
__vector double* v_y =(__vector double*)y;
__vector double* va0 = (__vector double*)ap[0];
__vector double* va1 = (__vector double*)ap[1];
__vector double* va2 = (__vector double*)ap[2];
__vector double* va3 = (__vector double*)ap[3];
__asm__ volatile (
"vlrepg %%v0,0(%5) \n\t"
"vlrepg %%v1,8(%5) \n\t"
"vlrepg %%v2,16(%5) \n\t"
"vlrepg %%v3,24(%5) \n\t"
"vlrepg %%v4,%7 \n\t"
"vfmdb %%v0,%%v0,%%v4 \n\t"
"vfmdb %%v1,%%v1,%%v4 \n\t"
"vfmdb %%v2,%%v2,%%v4 \n\t"
"vfmdb %%v3,%%v3,%%v4 \n\t"
"xgr %%r1,%%r1 \n\t"
for ( i=0; i< n/2; i+=2 )
{
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;
v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ;
}
}
"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
#else
"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 2,1024(%%r1,%6) \n\t"
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
FLOAT x[4] __attribute__ ((aligned (16)));
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v20,16(%%r1,%1) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,16(%%r1,%3) \n\t"
"vl %%v23,16(%%r1,%4) \n\t"
"vl %%v24,32(%%r1,%1) \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vl %%v28,48(%%r1,%1) \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vl %%v31,48(%%r1,%4) \n\t"
for ( i=0; i<4; i++)
x[i] = xo[i] * *alpha;
"vl %%v4,0(%%r1,%6) \n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
}
}
"vl %%v4,16(%%r1,%6) \n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"
"vl %%v4,32(%%r1,%6) \n\t"
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,32(%%r1,%6) \n\t"
#endif
"vl %%v4,48(%%r1,%6) \n\t"
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,48(%%r1,%6) \n\t"
#ifdef HAVE_KERNEL_4x2
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,64(%%r1,%2) \n\t"
"vl %%v18,64(%%r1,%3) \n\t"
"vl %%v19,64(%%r1,%4) \n\t"
"vl %%v20,80(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,80(%%r1,%3) \n\t"
"vl %%v23,80(%%r1,%4) \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
#elif HAVE_KERNEL_4x2_VEC
"vl %%v4,64(%%r1,%6) \n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,64(%%r1,%6) \n\t"
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0,x1;
x0 = xo[0] * *alpha;
x1 = xo[1] * *alpha;
__vector double v_x0 = {x0,x0};
__vector double v_x1 = {x1,x1};
__vector double* v_y =(__vector double*)y;
__vector double* va0 = (__vector double*)ap[0];
__vector double* va1 = (__vector double*)ap[1];
"vl %%v4,80(%%r1,%6) \n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,80(%%r1,%6) \n\t"
for ( i=0; i< n/2; i+=2 )
{
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ;
v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ;
}
}
#else
"vl %%v4,96(%%r1,%6) \n\t"
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,96(%%r1,%6) \n\t"
/*
 * y += alpha * (xo[0] * ap[0] + xo[1] * ap[1]) for two columns, plain C.
 * The two scaled x values are computed once, then rows are updated four at
 * a time while the row index is below n.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    FLOAT *col0 = ap[0];
    FLOAT *col1 = ap[1];
    FLOAT scaled[4] __attribute__ ((aligned (16)));
    BLASLONG row;
    BLASLONG k;

    /* Pre-multiply the two x entries by alpha. */
    scaled[0] = xo[0] * *alpha;
    scaled[1] = xo[1] * *alpha;

    for (row = 0; row < n; row += 4) {
        for (k = row; k < row + 4; k++) {
            y[k] += col0[k] * scaled[0] + col1[k] * scaled[1];
        }
    }
}
#endif
#ifdef HAVE_KERNEL_4x1
#elif HAVE_KERNEL_4x1_VEC
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0;
x0 = xo[0] * *alpha;
__vector double v_x0 = {x0,x0};
__vector double* v_y =(__vector double*)y;
__vector double* va0 = (__vector double*)ap;
for ( i=0; i< n/2; i+=2 )
{
v_y[i] += v_x0 * va0[i] ;
v_y[i+1] += v_x0 * va0[i+1] ;
}
"vl %%v4,112(%%r1,%6) \n\t"
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,112(%%r1,%6) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v20,16(%%r1,%1) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,16(%%r1,%3) \n\t"
"vl %%v23,16(%%r1,%4) \n\t"
"vl %%v4,0(%%r1,%6) \n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"
"vl %%v4,16(%%r1,%6) \n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0;
FLOAT x[4] __attribute__ ((aligned (16)));
a0 = ap;
__asm__ volatile (
"vlrepg %%v0,0(%3) \n\t"
"vlrepg %%v1,8(%3) \n\t"
"vlrepg %%v2,%5 \n\t"
"vfmdb %%v0,%%v0,%%v2 \n\t"
"vfmdb %%v1,%%v1,%%v2 \n\t"
"xgr %%r1,%%r1 \n\t"
for ( i=0; i<1; i++)
x[i] = xo[i] * *alpha;
"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0];
y[i+1] += a0[i+1]*x[0];
y[i+2] += a0[i+2]*x[0];
y[i+3] += a0[i+3]*x[0];
}
"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%4) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"vl %%v20,32(%%r1,%1) \n\t"
"vl %%v21,32(%%r1,%2) \n\t"
"vl %%v22,48(%%r1,%1) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vl %%v2,0(%%r1,%4) \n\t"
"vfmadb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"vl %%v2,16(%%r1,%4) \n\t"
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"
"vl %%v2,32(%%r1,%4) \n\t"
"vfmadb %%v2,%%v20,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v21,%%v1,%%v2 \n\t"
"vst %%v2,32(%%r1,%4) \n\t"
"vl %%v2,48(%%r1,%4) \n\t"
"vfmadb %%v2,%%v22,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v23,%%v1,%%v2 \n\t"
"vst %%v2,48(%%r1,%4) \n\t"
"vl %%v2,64(%%r1,%4) \n\t"
"vfmadb %%v2,%%v24,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v25,%%v1,%%v2 \n\t"
"vst %%v2,64(%%r1,%4) \n\t"
"vl %%v2,80(%%r1,%4) \n\t"
"vfmadb %%v2,%%v26,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v27,%%v1,%%v2 \n\t"
"vst %%v2,80(%%r1,%4) \n\t"
"vl %%v2,96(%%r1,%4) \n\t"
"vfmadb %%v2,%%v28,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v29,%%v1,%%v2 \n\t"
"vst %%v2,96(%%r1,%4) \n\t"
"vl %%v2,112(%%r1,%4) \n\t"
"vfmadb %%v2,%%v30,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v31,%%v1,%%v2 \n\t"
"vst %%v2,112(%%r1,%4) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"vl %%v2,0(%%r1,%4) \n\t"
"vfmadb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"vl %%v2,16(%%r1,%4) \n\t"
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vlrepg %%v0,0(%2) \n\t"
"vlrepg %%v1,%4 \n\t"
"vfmdb %%v0,%%v0,%%v1 \n\t"
"xgr %%r1,%%r1 \n\t"
#endif
"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%1) \n\t"
"vl %%v22,96(%%r1,%1) \n\t"
"vl %%v23,112(%%r1,%1) \n\t"
"vl %%v1,0(%%r1,%3) \n\t"
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"vl %%v1,16(%%r1,%3) \n\t"
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"
"vl %%v1,32(%%r1,%3) \n\t"
"vfmadb %%v1,%%v18,%%v0,%%v1 \n\t"
"vst %%v1,32(%%r1,%3) \n\t"
"vl %%v1,48(%%r1,%3) \n\t"
"vfmadb %%v1,%%v19,%%v0,%%v1 \n\t"
"vst %%v1,48(%%r1,%3) \n\t"
"vl %%v1,64(%%r1,%3) \n\t"
"vfmadb %%v1,%%v20,%%v0,%%v1 \n\t"
"vst %%v1,64(%%r1,%3) \n\t"
"vl %%v1,80(%%r1,%3) \n\t"
"vfmadb %%v1,%%v21,%%v0,%%v1 \n\t"
"vst %%v1,80(%%r1,%3) \n\t"
"vl %%v1,96(%%r1,%3) \n\t"
"vfmadb %%v1,%%v22,%%v0,%%v1 \n\t"
"vst %%v1,96(%%r1,%3) \n\t"
"vl %%v1,112(%%r1,%3) \n\t"
"vfmadb %%v1,%%v23,%%v0,%%v1 \n\t"
"vst %%v1,112(%%r1,%3) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v1,0(%%r1,%3) \n\t"
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"vl %%v1,16(%%r1,%3) \n\t"
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
for ( i=0; i<n; i++ ){
*dest += *src;
src++;
dest += inc_dest;
for (i = 0; i < n; i++)
{
*dest += src[i];
dest += inc_dest;
}
return;
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;

View File

@ -25,178 +25,460 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <vecintrin.h>
#endif
#define NBMAX 2048
#ifdef HAVE_KERNEL_4x4
#elif HAVE_KERNEL_4x4_VEC
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
__vector double* va0 = (__vector double*)ap[0];
__vector double* va1 = (__vector double*)ap[1];
__vector double* va2 = (__vector double*)ap[2];
__vector double* va3 = (__vector double*)ap[3];
__vector double* v_x =(__vector double*)x;
__vector double temp0 = {0,0};
__vector double temp1 = {0,0};
__vector double temp2 = {0,0};
__vector double temp3 = {0,0};
__asm__ volatile (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"xgr %%r1,%%r1 \n\t"
for ( i=0; i< n/2; i+=2 )
{
temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ;
temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ;
temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ;
temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ;
}
y[0] = temp0[0] + temp0[1];
y[1] = temp1[0] + temp1[1];
y[2] = temp2[0] + temp2[1];
y[3] = temp3[0] + temp3[1];;
"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 1,1024(%%r1,%5) \n\t"
"vl %%v16,0(%%r1,%5) \n\t"
"vl %%v17,16(%%r1,%5) \n\t"
"vl %%v18,32(%%r1,%5) \n\t"
"vl %%v19,48(%%r1,%5) \n\t"
"vl %%v20,64(%%r1,%5) \n\t"
"vl %%v21,80(%%r1,%5) \n\t"
"vl %%v22,96(%%r1,%5) \n\t"
"vl %%v23,112(%%r1,%5) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,0(%%r1,%3) \n\t"
"vfmadb %%v2,%%v16,%%v26,%%v2 \n\t"
"vl %%v27,0(%%r1,%4) \n\t"
"vfmadb %%v3,%%v16,%%v27,%%v3 \n\t"
"vl %%v28,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v28,%%v0 \n\t"
"vl %%v29,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v29,%%v1 \n\t"
"vl %%v30,16(%%r1,%3) \n\t"
"vfmadb %%v2,%%v17,%%v30,%%v2 \n\t"
"vl %%v31,16(%%r1,%4) \n\t"
"vfmadb %%v3,%%v17,%%v31,%%v3 \n\t"
"vl %%v24,32(%%r1,%1) \n\t"
"vfmadb %%v0,%%v18,%%v24,%%v0 \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vfmadb %%v1,%%v18,%%v25,%%v1 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2 \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vfmadb %%v3,%%v18,%%v27,%%v3 \n\t"
"vl %%v28,48(%%r1,%1) \n\t"
"vfmadb %%v0,%%v19,%%v28,%%v0 \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vfmadb %%v1,%%v19,%%v29,%%v1 \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vfmadb %%v2,%%v19,%%v30,%%v2 \n\t"
"vl %%v31,48(%%r1,%4) \n\t"
"vfmadb %%v3,%%v19,%%v31,%%v3 \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vfmadb %%v0,%%v20,%%v24,%%v0 \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vfmadb %%v1,%%v20,%%v25,%%v1 \n\t"
"vl %%v26,64(%%r1,%3) \n\t"
"vfmadb %%v2,%%v20,%%v26,%%v2 \n\t"
"vl %%v27,64(%%r1,%4) \n\t"
"vfmadb %%v3,%%v20,%%v27,%%v3 \n\t"
"vl %%v28,80(%%r1,%1) \n\t"
"vfmadb %%v0,%%v21,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vfmadb %%v1,%%v21,%%v29,%%v1 \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vfmadb %%v2,%%v21,%%v30,%%v2 \n\t"
"vl %%v31,80(%%r1,%4) \n\t"
"vfmadb %%v3,%%v21,%%v31,%%v3 \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vfmadb %%v0,%%v22,%%v24,%%v0 \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vfmadb %%v1,%%v22,%%v25,%%v1 \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vfmadb %%v2,%%v22,%%v26,%%v2 \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vfmadb %%v3,%%v22,%%v27,%%v3 \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vfmadb %%v0,%%v23,%%v28,%%v0 \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vfmadb %%v1,%%v23,%%v29,%%v1 \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vfmadb %%v2,%%v23,%%v30,%%v2 \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
"vfmadb %%v3,%%v23,%%v31,%%v3 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%5) \n\t"
"vl %%v17,16(%%r1,%5) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,0(%%r1,%3) \n\t"
"vfmadb %%v2,%%v16,%%v26,%%v2 \n\t"
"vl %%v27,0(%%r1,%4) \n\t"
"vfmadb %%v3,%%v16,%%v27,%%v3 \n\t"
"vl %%v28,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v28,%%v0 \n\t"
"vl %%v29,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v29,%%v1 \n\t"
"vl %%v30,16(%%r1,%3) \n\t"
"vfmadb %%v2,%%v17,%%v30,%%v2 \n\t"
"vl %%v31,16(%%r1,%4) \n\t"
"vfmadb %%v3,%%v17,%%v31,%%v3 \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"vrepg %%v4,%%v0,1 \n\t"
"adbr %%f0,%%f4 \n\t"
"std %%f0,0(%6) \n\t"
"vrepg %%v4,%%v1,1 \n\t"
"adbr %%f1,%%f4 \n\t"
"std %%f1,8(%6) \n\t"
"vrepg %%v4,%%v2,1 \n\t"
"adbr %%f2,%%f4 \n\t"
"std %%f2,16(%6) \n\t"
"vrepg %%v4,%%v3,1 \n\t"
"adbr %%f3,%%f4 \n\t"
"std %%f3,24(%6) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
/*
 * Scalar fallback: four dot products of length n.
 *
 * Row r (r = 0..3) is ap[r]; each row is multiplied element-wise with x and
 * summed, and the four sums are stored in y[0..3].  Elements are consumed
 * four at a time, so indices up to the next multiple of four at or above n
 * are read.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    FLOAT acc[4] = { 0.0, 0.0, 0.0, 0.0 };
    BLASLONG col;
    int r;

    for (col = 0; col < n; col += 4)
    {
        for (r = 0; r < 4; r++)
        {
            const FLOAT *row = ap[r];

            /* Same association order as a single chained expression. */
            acc[r] += row[col]*x[col] + row[col+1]*x[col+1] + row[col+2]*x[col+2] + row[col+3]*x[col+3];
        }
    }

    for (r = 0; r < 4; r++)
    {
        y[r] = acc[r];
    }
}
#endif
#ifdef HAVE_KERNEL_4x2
#elif HAVE_KERNEL_4x2_VEC
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
__vector double* va0 = (__vector double*)ap[0];
__vector double* va1 = (__vector double*)ap[1];
__vector double* v_x =(__vector double*)x;
__vector double temp0 = {0,0};
__vector double temp1 = {0,0};
__asm__ volatile (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"xgr %%r1,%%r1 \n\t"
for ( i=0; i< n/2; i+=2 )
{
temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ;
temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ;
}
y[0] = temp0[0] + temp0[1];
y[1] = temp1[0] + temp1[1];
"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v26,%%v0 \n\t"
"vl %%v27,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v27,%%v1 \n\t"
"vl %%v28,32(%%r1,%1) \n\t"
"vfmadb %%v0,%%v18,%%v28,%%v0 \n\t"
"vl %%v29,32(%%r1,%2) \n\t"
"vfmadb %%v1,%%v18,%%v29,%%v1 \n\t"
"vl %%v30,48(%%r1,%1) \n\t"
"vfmadb %%v0,%%v19,%%v30,%%v0 \n\t"
"vl %%v31,48(%%r1,%2) \n\t"
"vfmadb %%v1,%%v19,%%v31,%%v1 \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vfmadb %%v0,%%v20,%%v24,%%v0 \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vfmadb %%v1,%%v20,%%v25,%%v1 \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vfmadb %%v0,%%v21,%%v26,%%v0 \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vfmadb %%v1,%%v21,%%v27,%%v1 \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vfmadb %%v0,%%v22,%%v28,%%v0 \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vfmadb %%v1,%%v22,%%v29,%%v1 \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vfmadb %%v0,%%v23,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vfmadb %%v1,%%v23,%%v31,%%v1 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v26,%%v0 \n\t"
"vl %%v27,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v27,%%v1 \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"adbr %%f0,%%f2 \n\t"
"std %%f0,0(%4) \n\t"
"vrepg %%v2,%%v1,1 \n\t"
"adbr %%f1,%%f2 \n\t"
"std %%f1,8(%4) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
/*
 * Scalar fallback: two dot products of length n.
 *
 * y[0] receives the sum of ap[0][k] * x[k] and y[1] the sum of
 * ap[1][k] * x[k].  The index advances by four per pass, so elements up to
 * the next multiple of four at or above n are read.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    FLOAT *row0 = ap[0];
    FLOAT *row1 = ap[1];
    FLOAT sum0 = 0.0;
    FLOAT sum1 = 0.0;
    BLASLONG k = 0;

    while (k < n)
    {
        sum0 += row0[k]*x[k] + row0[k+1]*x[k+1] + row0[k+2]*x[k+2] + row0[k+3]*x[k+3];
        sum1 += row1[k]*x[k] + row1[k+1]*x[k+1] + row1[k+2]*x[k+2] + row1[k+3]*x[k+3];
        k += 4;
    }

    y[0] = sum0;
    y[1] = sum1;
}
#endif
#ifdef HAVE_KERNEL_4x1
#elif HAVE_KERNEL_4x1_VEC
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
BLASLONG i;
__vector double* va0 = (__vector double*)a0;
__vector double* v_x =(__vector double*)x;
__vector double temp0 = {0,0};
__asm__ volatile (
"vzero %%v0 \n\t"
"xgr %%r1,%%r1 \n\t"
for ( i=0; i< n/2; i+=2 )
{
temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ;
}
y[0] = temp0[0] + temp0[1];
}
#else
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT temp0 = 0.0;
"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
for ( i=0; i< n; i+=4 )
{
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
}
y[0] = temp0;
"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%1) \n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%1) \n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%1) \n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%1) \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%1) \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%1) \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"std %%f0,0(%3) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#endif
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
dest++;
src += inc_src;
}
BLASLONG i;
for (i = 0; i < n; i++)
{
dest[i] = *src;
src += inc_src;
}
}
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
/*
 * dest[i] += da * src[i] for contiguous src/dest, written as z/Architecture
 * vector inline assembly.
 *
 * Work is split into a main loop over (n & -16) elements, 16 doubles per
 * pass, and a second loop over (n & 12) elements, 4 doubles per pass.  The
 * low two bits of n are never looked at, so a remainder of n % 4 elements
 * is left untouched.
 */
static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{
__asm__ volatile (
/* v0 = {da, da}; r1 = byte offset into src/dest, starting at 0 */
"vlrepg %%v0,%1 \n\t"
"xgr %%r1,%%r1 \n\t"
/* r0 = n & -16; skip the main loop when that is zero */
"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
/* r0 = number of 16-element passes */
"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
/* prefetch both streams 1024 bytes ahead */
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
/* v16..v23 = 8 vectors (16 doubles) of src */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* for each vector: load dest, dest = src * da + dest, store back */
"vl %%v24, 0(%%r1,%3) \n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"
"vl %%v25, 16(%%r1,%3) \n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25 \n\t"
"vst %%v25, 16(%%r1,%3) \n\t"
"vl %%v26, 32(%%r1,%3) \n\t"
"vfmadb %%v26,%%v18,%%v0,%%v26 \n\t"
"vst %%v26, 32(%%r1,%3) \n\t"
"vl %%v27, 48(%%r1,%3) \n\t"
"vfmadb %%v27,%%v19,%%v0,%%v27 \n\t"
"vst %%v27, 48(%%r1,%3) \n\t"
"vl %%v28, 64(%%r1,%3) \n\t"
"vfmadb %%v28,%%v20,%%v0,%%v28 \n\t"
"vst %%v28, 64(%%r1,%3) \n\t"
"vl %%v29, 80(%%r1,%3) \n\t"
"vfmadb %%v29,%%v21,%%v0,%%v29 \n\t"
"vst %%v29, 80(%%r1,%3) \n\t"
"vl %%v30, 96(%%r1,%3) \n\t"
"vfmadb %%v30,%%v22,%%v0,%%v30 \n\t"
"vst %%v30, 96(%%r1,%3) \n\t"
"vl %%v31, 112(%%r1,%3) \n\t"
"vfmadb %%v31,%%v23,%%v0,%%v31 \n\t"
"vst %%v31, 112(%%r1,%3) \n\t"
/* advance 128 bytes (16 doubles); loop while the pass counter is nonzero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
/* r0 = n & 12: elements left in whole groups of four */
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
/* r0 = number of 4-element passes */
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%3) \n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"
"vl %%v25, 16(%%r1,%3) \n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25 \n\t"
"vst %%v25, 16(%%r1,%3) \n\t"
/* advance 32 bytes (4 doubles) */
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
/* %0 = n, %1 = da (read from memory), %2 = src, %3 = dest */
/* NOTE(review): "ZR" looks like an s390-specific address/memory constraint - confirm against the GCC machine-constraint docs */
:"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
if (inc_dest == 1)
add_y_kernel_4(n, da, src, dest);
else
{
BLASLONG i;
for ( i=0; i<n; i++ )
for (i = 0; i < n; i++)
{
*dest += src[i] * da;
dest += inc_dest;
*dest += src[i] * da;
dest += inc_dest;
}
return;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@ -212,7 +494,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[4],*xbuffer;
FLOAT ybuffer[2] __attribute__ ((aligned(16)));
FLOAT *xbuffer;
FLOAT *ytemp;
if ( m < 1 ) return(0);
@ -234,7 +517,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{

182
kernel/zarch/dmax.c Normal file
View File

@ -0,0 +1,182 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Maximum of the doubles in x, written as z/Architecture vector inline
 * assembly.
 *
 * The loop body runs n >> 5 times and consumes 32 doubles (256 bytes) per
 * pass.  It is entered without testing the trip count, so n must be at
 * least 32; elements past the last full group of 32 are not examined.
 * Returns the largest value found.
 */
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT max;
__asm__ volatile (
/* v0 = running maximum (two lanes), seeded with the first two elements */
"vl %%v0,0(%2) \n\t"
/* r0 = n / 32 passes; r1 = byte offset into x */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* first half of the pass: 8 vectors = 16 doubles */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* pairwise reduction: vfchdb builds a lane mask for "a > b", vsel then
   keeps a where the mask is set and b elsewhere (8 -> 4 vectors) */
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
/* 4 -> 2 vectors */
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
/* 2 -> 1 vector */
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* fold into the running maximum */
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* second half of the pass: next 16 doubles, same reduction tree */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* advance 256 bytes; brctg decrements r0 and loops while it is nonzero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: compare lane 1 of v0 against lane 0, keep the larger
   one in lane 0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* copy f0 to the result; relies on f0 overlaying lane 0 of v0 */
"ldr %0,%%f0 "
:"=f"(max)
/* %1 = n, %2 = x */
/* NOTE(review): "ZR" looks like an s390-specific address/memory constraint - confirm against the GCC machine-constraint docs */
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return max;
}
/*
 * Largest value among n elements of x, taken with stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * (n & -32) elements go through dmax_kernel_32 and the rest are scanned
 * here; for any other stride the whole vector is scanned here with a
 * four-way unrolled loop followed by a remainder loop.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG idx;       /* index into x of the next element to examine */
    BLASLONG seen;      /* number of logical elements already examined */
    BLASLONG unrolled_end;
    BLASLONG k;

    if (n <= 0 || inc_x <= 0) {
        return (best);
    }

    if (inc_x != 1) {
        /* Strided vector: seed with element 0, then walk the rest. */
        best = x[0];
        idx = inc_x;
        seen = 1;

        unrolled_end = (n - 1) & -4;
        for (; seen < unrolled_end; seen += 4) {
            for (k = 0; k < 4; k++, idx += inc_x) {
                if (x[idx] > best) {
                    best = x[idx];
                }
            }
        }

        for (; seen < n; seen++, idx += inc_x) {
            if (x[idx] > best) {
                best = x[idx];
            }
        }
        return (best);
    }

    /* Contiguous vector: vector kernel for the bulk, scalar scan for the tail. */
    unrolled_end = n & -32;
    if (unrolled_end > 0) {
        best = dmax_kernel_32(unrolled_end, x);
        idx = unrolled_end;
    } else {
        best = x[0];
        idx = 1;
    }

    for (; idx < n; idx++) {
        if (x[idx] > best) {
            best = x[idx];
        }
    }
    return (best);
}

182
kernel/zarch/dmin.c Normal file
View File

@ -0,0 +1,182 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Minimum of the doubles in x, written as z/Architecture vector inline
 * assembly.
 *
 * The loop body runs n >> 5 times and consumes 32 doubles (256 bytes) per
 * pass.  It is entered without testing the trip count, so n must be at
 * least 32; elements past the last full group of 32 are not examined.
 * Returns the smallest value found.
 */
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT min;
__asm__ volatile (
/* v0 = running minimum (two lanes), seeded with the first two elements */
"vl %%v0,0(%2) \n\t"
/* r0 = n / 32 passes; r1 = byte offset into x */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* first half of the pass: 8 vectors = 16 doubles */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* pairwise reduction: the compare operands are swapped relative to the
   select operands, so the mask marks lanes where b > a and vsel keeps a
   there and b elsewhere, i.e. the smaller value (8 -> 4 vectors) */
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
/* 4 -> 2 vectors */
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
/* 2 -> 1 vector */
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* fold into the running minimum */
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* second half of the pass: next 16 doubles, same reduction tree */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* advance 256 bytes; brctg decrements r0 and loops while it is nonzero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: compare lane 0 of v0 against lane 1, keep the smaller
   one in lane 0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* copy f0 to the result; relies on f0 overlaying lane 0 of v0 */
"ldr %0,%%f0 "
:"=f"(min)
/* %1 = n, %2 = x */
/* NOTE(review): "ZR" looks like an s390-specific address/memory constraint - confirm against the GCC machine-constraint docs */
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return min;
}
/*
 * Smallest value among n elements of x, taken with stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * (n & -32) elements go through dmin_kernel_32 and the rest are scanned
 * here; for any other stride the whole vector is scanned here with a
 * four-way unrolled loop followed by a remainder loop.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT lowest = 0.0;
    BLASLONG pos;       /* index into x of the next element to examine */
    BLASLONG count;     /* number of logical elements already examined */
    BLASLONG limit;
    BLASLONG step;

    if (n <= 0 || inc_x <= 0) {
        return (lowest);
    }

    if (inc_x == 1) {
        /* Contiguous vector: vector kernel for the bulk, scalar scan for the tail. */
        limit = n & -32;
        if (limit > 0) {
            lowest = dmin_kernel_32(limit, x);
            pos = limit;
        } else {
            lowest = x[0];
            pos = 1;
        }

        while (pos < n) {
            lowest = (x[pos] < lowest) ? x[pos] : lowest;
            pos++;
        }
        return (lowest);
    }

    /* Strided vector: seed with element 0, then walk the rest. */
    lowest = x[0];
    pos = inc_x;
    count = 1;

    limit = (n - 1) & -4;
    while (count < limit) {
        for (step = 0; step < 4; step++) {
            lowest = (x[pos] < lowest) ? x[pos] : lowest;
            pos += inc_x;
        }
        count += 4;
    }

    while (count < n) {
        lowest = (x[pos] < lowest) ? x[pos] : lowest;
        pos += inc_x;
        count++;
    }
    return (lowest);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * drot_kernel_32: in-place plane rotation of two double-precision vectors,
 *     x[i] <- c * x[i] + s * y[i]
 *     y[i] <- c * y[i] - s * x[i]
 * handled 32 elements at a time: every loop pass touches 256 bytes of x and
 * 256 bytes of y, and the pass count is n >> 5.
 *
 * NOTE(review): two prototypes and two asm statements follow back to back.
 * This looks like both sides of a diff printed without +/- markers. The
 * second prototype (c and s passed by pointer) and the second asm statement
 * (numbered operands, c and s loaded from memory) presumably form the
 * current version, since the visible call site passes &cosa and &sina --
 * confirm against the real source file before relying on this.
 */
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
/* Variant with named operands: cos and sin arrive in FP registers and are
   copied into both halves of v0 and v1 via a general register. */
__asm__ (
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"lgdr %%r1,%[cos] \n\t"
"vlvgp %%v0,%%r1,%%r1 \n\t"
"lgdr %%r1,%[sin] \n\t"
"vlvgp %%v1,%%r1,%%r1 \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 112(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
: "cc", "r1" ,"v0","v1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
/* Variant with numbered operands: %0 = n, %1 = x, %2 = y, %3 = *c, %4 = *s.
   c and s are loaded from memory into v0 and v1, r0 holds the pass count
   (n >> 5) and r1 the running byte offset into x and y. */
__asm__ (
"vlrepg %%v0,%3 \n\t"
"vlrepg %%v1,%4 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@ -214,8 +204,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
drot_kernel_32(n1, x, y, c, s);
FLOAT cosa,sina;
cosa=c;
sina=s;
drot_kernel_32(n1, x, y, &cosa, &sina);
i=n1;
}
@ -229,6 +221,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}
}
else
{
@ -250,3 +243,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}

View File

@ -27,135 +27,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * Scaling kernels for a contiguous vector x: one family multiplies every
 * element by da, the other overwrites x with zeros.
 *
 * NOTE(review): several prototypes (dscal_kernel_32, dscal_kernel_16,
 * dscal_kernel_16_zero, dscal_kernel_32_zero) and their asm bodies are
 * interleaved below, together with #ifdef Z13_A / #else / #endif lines.
 * This looks like both sides of a diff printed without +/- markers; the
 * *_16 kernels with numbered operands are presumably the current ones,
 * since the visible call sites use dscal_kernel_16 and
 * dscal_kernel_16_zero -- confirm against the real source file.
 */
#ifdef Z13_A
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x)
{
/* x[i] *= da. Operands: %0 = n, %1 = da (read from memory), %2 = x.
   Each pass scales eight 16-byte vectors (128 bytes); r0 = n >> 4 passes. */
__asm__ volatile (
"vlrepg %%v0,%1 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%2) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%2) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%2) \n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%2) \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%2) \n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
);
}
/* Named-operand scaling variant: loads for the next chunk are issued while
   the products of the current chunk are computed, and the label 2 section
   finishes the last chunk after the loop. */
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t"
"srlg %[n],%[n],4 \n\t"
"vlr %%v1,%%v0 \n\t"
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
"la %[x_ptr], 128(%[x_ptr]) \n\t"
"aghik %[n], %[n], -1 \n\t"
"jle 2f \n\t"
".align 16 \n\t"
"1: \n\t"
"vfmdb %%v24, %%v16, %%v0 \n\t"
"vfmdb %%v25, %%v17, %%v0 \n\t"
"vfmdb %%v26, %%v18, %%v0 \n\t"
"vfmdb %%v27, %%v19, %%v1 \n\t"
"vlm %%v16,%%v19, 0(%[x_ptr]) \n\t"
"vfmdb %%v28, %%v20, %%v0 \n\t"
"vfmdb %%v29, %%v21, %%v1 \n\t"
"vfmdb %%v30, %%v22, %%v0 \n\t"
"vfmdb %%v31, %%v23, %%v1 \n\t"
"vlm %%v20,%%v23, 64(%[x_ptr]) \n\t"
"lay %[x_ptr], -128(%[x_ptr]) \n\t"
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
"la %[x_ptr],256(%[x_ptr]) \n\t"
"brctg %[n],1b \n\t"
"2: \n\t"
"vfmdb %%v24, %%v16, %%v0 \n\t"
"vfmdb %%v25, %%v17, %%v1 \n\t"
"vfmdb %%v26, %%v18, %%v0 \n\t"
"vfmdb %%v27, %%v19, %%v1 \n\t"
"lay %[x_ptr] , -128(%[x_ptr]) \n\t"
"vfmdb %%v28, %%v20, %%v0 \n\t"
"vfmdb %%v29, %%v21, %%v1 \n\t"
"vfmdb %%v30, %%v22, %%v0 \n\t"
"vfmdb %%v31, %%v23, %%v1 \n\t"
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n)
: [alpha] "f"(da)
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x)
{
/* Zero fill. Operands: %0 = n, %1 = x. v24..v27 are cleared and r0 = n >> 4
   passes are set up here; the matching vst lines (128 bytes per pass) appear
   further down, after the interleaved named-operand text. */
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
/* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t"
"vlr %%v1,%%v0 \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%[x_ptr]) \n\t"
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v1 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v1 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v1 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v1 \n\t"
"vstm %%v16,%%v23, 0(%[x_ptr]) \n\t"
"vlm %%v24,%%v31,128(%[x_ptr]) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vfmdb %%v25,%%v25,%%v1 \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vfmdb %%v27,%%v27,%%v1 \n\t"
"vfmdb %%v28,%%v28,%%v0 \n\t"
"vfmdb %%v29,%%v29,%%v1 \n\t"
"vfmdb %%v30,%%v30,%%v0 \n\t"
"vfmdb %%v31,%%v31,%%v1 \n\t"
"vstm %%v24,%%v31,128(%[x_ptr]) \n\t"
"la %[x_ptr], 256(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n),[alpha] "f"(da)
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#endif
static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x )
{
/* Named-operand zero fill: r0 holds the end address (x + n * 8 bytes) and
   each pass stores 256 bytes of zeros until x_ptr reaches it. */
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
"vzero %%v24 \n\t"
"sllg %%r0,%[n],3 \n\t"
"vzero %%v25 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%[x_ptr]) \n\t"
"vst %%v24, 0(%[x_ptr]) \n\t"
"vst %%v25, 16(%[x_ptr]) \n\t"
"vst %%v24, 32(%[x_ptr]) \n\t"
"vst %%v25, 48(%[x_ptr]) \n\t"
"vst %%v24, 64(%[x_ptr]) \n\t"
"vst %%v25, 80(%[x_ptr]) \n\t"
"vst %%v24, 96(%[x_ptr]) \n\t"
"vst %%v25, 112(%[x_ptr]) \n\t"
"vst %%v24, 128(%[x_ptr]) \n\t"
"vst %%v25, 144(%[x_ptr]) \n\t"
"vst %%v24, 160(%[x_ptr]) \n\t"
"vst %%v25, 176(%[x_ptr]) \n\t"
"vst %%v24, 192(%[x_ptr]) \n\t"
"vst %%v25, 208(%[x_ptr]) \n\t"
"vst %%v24, 224(%[x_ptr]) \n\t"
"vst %%v25, 240(%[x_ptr]) \n\t"
"la %[x_ptr],256(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n)
:"cc" , "r0", "v24" ,"v25"
);
"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0,j=0;
@ -169,11 +109,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{
BLASLONG n1 = n & -32;
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
dscal_kernel_32_zero(n1 , x);
dscal_kernel_16_zero(n1, x);
j=n1;
}
@ -188,10 +128,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{
BLASLONG n1 = n & -32;
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
dscal_kernel_32(n1 , da , x);
dscal_kernel_16(n1, da, x);
j=n1;
}
while(j < n)
@ -260,4 +200,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
return 0;
}
}

180
kernel/zarch/dsdot.c Normal file
View File

@ -0,0 +1,180 @@
/***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice,this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice,this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL
DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * dsdot_kernel_32: dot product of x[0..n) and y[0..n), returned as a double.
 *
 * Operands: %0 = result, %1 = n, %2 = x, %3 = y. r0 holds the pass count
 * (n >> 5) and r1 the running byte offset. Each pass loads eight 16-byte
 * vectors (128 bytes) from x and from y, so n is expected to be a positive
 * multiple of 32; the caller passes n & -32 and skips the call when that
 * value is zero.
 *
 * NOTE(review): the products are formed with vfmsb, i.e. in single precision,
 * and only the rounded products are widened to double before being summed.
 * If DSDOT is meant to multiply in double precision, the inputs would have to
 * be widened before the multiply -- confirm the intended accuracy.
 */
static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
double dot;
__asm__ volatile (
"vzero %%v0 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,0(%%r1,%3) \n\t"
"vfmsb %%v16,%%v16,%%v24 \n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmsb %%v17,%%v17,%%v25 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmsb %%v18,%%v18,%%v26 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmsb %%v19,%%v19,%%v27 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmsb %%v20,%%v20,%%v28 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmsb %%v21,%%v21,%%v29 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmsb %%v22,%%v22,%%v30 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmsb %%v23,%%v23,%%v31 \n\t"
/* Widen the products to double in two steps: vflls converts two of the four
   word slots of each vector (presumably the even-indexed ones -- confirm
   against the ISA reference); veslg shifts each doubleword left by 32 bits so
   that the remaining slots take their place for a second vflls. */
"vflls %%v24,%%v16 \n\t"
"vflls %%v25,%%v17 \n\t"
"vflls %%v26,%%v18 \n\t"
"vflls %%v27,%%v19 \n\t"
"vflls %%v28,%%v20 \n\t"
"vflls %%v29,%%v21 \n\t"
"vflls %%v30,%%v22 \n\t"
"vflls %%v31,%%v23 \n\t"
"veslg %%v16,%%v16,32 \n\t"
"veslg %%v17,%%v17,32 \n\t"
"veslg %%v18,%%v18,32 \n\t"
"veslg %%v19,%%v19,32 \n\t"
"veslg %%v20,%%v20,32 \n\t"
"veslg %%v21,%%v21,32 \n\t"
"veslg %%v22,%%v22,32 \n\t"
"veslg %%v23,%%v23,32 \n\t"
"vflls %%v16,%%v16 \n\t"
"vflls %%v17,%%v17 \n\t"
"vflls %%v18,%%v18 \n\t"
"vflls %%v19,%%v19 \n\t"
"vflls %%v20,%%v20 \n\t"
"vflls %%v21,%%v21 \n\t"
"vflls %%v22,%%v22 \n\t"
"vflls %%v23,%%v23 \n\t"
/* Pairwise reduction of the sixteen double vectors into v16, then into the
   running sum v0. */
"vfadb %%v16,%%v16,%%v24 \n\t"
"vfadb %%v17,%%v17,%%v25 \n\t"
"vfadb %%v18,%%v18,%%v26 \n\t"
"vfadb %%v19,%%v19,%%v27 \n\t"
"vfadb %%v20,%%v20,%%v28 \n\t"
"vfadb %%v21,%%v21,%%v29 \n\t"
"vfadb %%v22,%%v22,%%v30 \n\t"
"vfadb %%v23,%%v23,%%v31 \n\t"
"vfadb %%v16,%%v16,%%v20 \n\t"
"vfadb %%v17,%%v17,%%v21 \n\t"
"vfadb %%v18,%%v18,%%v22 \n\t"
"vfadb %%v19,%%v19,%%v23 \n\t"
"vfadb %%v16,%%v16,%%v18 \n\t"
"vfadb %%v17,%%v17,%%v19 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v0,%%v16,%%v0 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* Fold the two doubleword lanes of v0 into one scalar and hand it back. */
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return dot;
}
/*
 * DSDOT entry point: dot product of two FLOAT vectors, accumulated and
 * returned in double precision.
 *
 *   n      number of elements; n <= 0 returns 0.0
 *   x, y   input vectors
 *   inc_x  distance (in elements) between consecutive entries of x
 *   inc_y  distance (in elements) between consecutive entries of y
 *
 * Unit-stride input hands the leading n & -32 elements to dsdot_kernel_32
 * and finishes the remainder here; every other stride combination uses the
 * scalar loops at the bottom.
 *
 * Both operands of every scalar product are widened to double before the
 * multiply. Without the casts the product would be evaluated, and rounded,
 * in FLOAT precision and only then added to the double accumulator.
 */
double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
    BLASLONG i=0;
    BLASLONG ix=0,iy=0;
    double dot = 0.0 ;

    if ( n <= 0 ) return(dot);

    if ( (inc_x == 1) && (inc_y == 1) )
    {
        BLASLONG n1 = n & -32;

        /* The kernel needs at least one full chunk of 32 elements. */
        if ( n1 )
            dot = dsdot_kernel_32(n1,x,y);

        i = n1;
        while(i < n)
        {
            dot += (double) y[i] * (double) x[i] ;
            i++ ;
        }
        return(dot);
    }

    /* Strided input: two elements per pass, then a possible odd leftover. */
    BLASLONG n1 = n & -2;
    while(i < n1)
    {
        dot += (double) y[iy] * (double) x[ix]
             + (double) y[iy+inc_y] * (double) x[ix+inc_x];
        ix += inc_x*2 ;
        iy += inc_y*2 ;
        i+=2 ;
    }
    while(i < n)
    {
        dot += (double) y[iy] * (double) x[ix] ;
        ix += inc_x ;
        iy += inc_y ;
        i++ ;
    }
    return(dot);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -25,217 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * dswap_kernel_32: exchanges the contents of x and y, 256 bytes of each
 * vector per loop pass, with n >> 5 passes.
 *
 * NOTE(review): more than one prototype and several asm fragments are
 * interleaved below, together with #if defined(Z13_SWAP_A) / #else / #endif
 * lines. This looks like both sides of a diff printed without +/- markers;
 * the fragments using numbered operands (%0 = n, %1 = x, %2 = y) presumably
 * belong to the current version -- confirm against the real source file.
 */
#if defined(Z13_SWAP_A)
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"
/* Numbered-operand variant: a whole 256-byte chunk of x is held in v16..v31
   while y is copied over x in two 128-byte halves through v0..v7; the saved
   x chunk is then written to y. */
__asm__ volatile(
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
,"v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
/* Named-operand variant with the same load/store schedule: x chunk parked in
   v16..v31, y moved over x through v0..v7, then the parked chunk stored to y. */
__asm__ volatile(
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
@ -284,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
}

319
kernel/zarch/icamax.c Normal file
View File

@ -0,0 +1,319 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS: absolute value routine matching the precision selected by DOUBLE. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * CABS1(x, i): |x[i]| + |x[i + 1]|, i.e. the sum of the absolute values of
 * two consecutive entries of x starting at index i. Arguments are fully
 * parenthesised so that expressions such as `p + 2` or `1 << 1` expand
 * correctly; `i` is evaluated twice, so it must be free of side effects.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/*
 * Vector kernel for the unit-stride case: scans n interleaved (re, im) pairs
 * of 4-byte floats starting at x and finds an element with the largest
 * |re| + |im|.
 *
 *   n     number of complex elements; the loop count is n >> 5 and the loop
 *         is a decrement-and-branch-on-nonzero, so n must be a positive
 *         multiple of 32 (the caller passes n & -32 and checks it is > 0).
 *   x     input vector, 2 * n floats.
 *   amax  (out) receives the winning |re| + |im| value.
 *
 * Returns the 0-based index of the selected element.
 *
 * Register roles:
 *   v0        running maxima, one per 32-bit lane (4 lanes)
 *   v1 / v2   64-bit indices belonging to lanes 0,2 / lanes 1,3 of v0
 *   v3        constant 16 (index step per half iteration)
 *   v4        index base of the 16-element group being processed
 *   v24-v27   constant 32-bit lane indices 0..15
 *   r0        remaining loop iterations, r1 byte offset into x
 *
 * Fixes relative to the previous revision: the last imaginary lane of v19 in
 * the first half of the loop was loaded from displacement 30 instead of 60,
 * and the prefetch did not follow the running offset in r1.
 */
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
{
    BLASLONG iamax;

    __asm__ volatile (
        /* v0 = |re| + |im| of elements 0..3 (initial per-lane maxima) */
        "vlef %%v0,0(%3),0 \n\t"
        "vlef %%v1,4(%3),0 \n\t"
        "vlef %%v0,8(%3),1 \n\t"
        "vlef %%v1,12(%3),1 \n\t"
        "vlef %%v0,16(%3),2 \n\t"
        "vlef %%v1,20(%3),2 \n\t"
        "vlef %%v0,24(%3),3 \n\t"
        "vlef %%v1,28(%3),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v1,%%v1 \n\t"
        "vfasb %%v0,%%v0,%%v1 \n\t"
        /* initial indices: v1 = {0, 2}, v2 = {1, 3}; step 16; base 0 */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,16 \n\t"
        "vzero %%v4 \n\t"
        /* lane index constants 0..15 */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        /* r0 = n / 32 iterations, r1 = 0 byte offset */
        "srlg %%r0,%2,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"
        /* first 16 elements: real parts -> even regs, imaginary -> odd regs */
        "vlef %%v16,0(%%r1,%3),0 \n\t"
        "vlef %%v17,4(%%r1,%3),0 \n\t"
        "vlef %%v16,8(%%r1,%3),1 \n\t"
        "vlef %%v17,12(%%r1,%3),1 \n\t"
        "vlef %%v16,16(%%r1,%3),2 \n\t"
        "vlef %%v17,20(%%r1,%3),2 \n\t"
        "vlef %%v16,24(%%r1,%3),3 \n\t"
        "vlef %%v17,28(%%r1,%3),3 \n\t"
        "vlef %%v18,32(%%r1,%3),0 \n\t"
        "vlef %%v19,36(%%r1,%3),0 \n\t"
        "vlef %%v18,40(%%r1,%3),1 \n\t"
        "vlef %%v19,44(%%r1,%3),1 \n\t"
        "vlef %%v18,48(%%r1,%3),2 \n\t"
        "vlef %%v19,52(%%r1,%3),2 \n\t"
        "vlef %%v18,56(%%r1,%3),3 \n\t"
        "vlef %%v19,60(%%r1,%3),3 \n\t"
        "vlef %%v20,64(%%r1,%3),0 \n\t"
        "vlef %%v21,68(%%r1,%3),0 \n\t"
        "vlef %%v20,72(%%r1,%3),1 \n\t"
        "vlef %%v21,76(%%r1,%3),1 \n\t"
        "vlef %%v20,80(%%r1,%3),2 \n\t"
        "vlef %%v21,84(%%r1,%3),2 \n\t"
        "vlef %%v20,88(%%r1,%3),3 \n\t"
        "vlef %%v21,92(%%r1,%3),3 \n\t"
        "vlef %%v22,96(%%r1,%3),0 \n\t"
        "vlef %%v23,100(%%r1,%3),0 \n\t"
        "vlef %%v22,104(%%r1,%3),1 \n\t"
        "vlef %%v23,108(%%r1,%3),1 \n\t"
        "vlef %%v22,112(%%r1,%3),2 \n\t"
        "vlef %%v23,116(%%r1,%3),2 \n\t"
        "vlef %%v22,120(%%r1,%3),3 \n\t"
        "vlef %%v23,124(%%r1,%3),3 \n\t"
        /* v16..v19 = |re| + |im| for the 16 elements */
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        /* reduce 16 values to 4 lane maxima (v16) and their lane indices (v5) */
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* widen indices to 64 bit (v6: lanes 1,3; v5: lanes 0,2), add base */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        /* merge into running maxima; strict '>' keeps the earlier entry on ties */
        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* second 16 elements of this iteration, same sequence */
        "vlef %%v16,128(%%r1,%3),0 \n\t"
        "vlef %%v17,132(%%r1,%3),0 \n\t"
        "vlef %%v16,136(%%r1,%3),1 \n\t"
        "vlef %%v17,140(%%r1,%3),1 \n\t"
        "vlef %%v16,144(%%r1,%3),2 \n\t"
        "vlef %%v17,148(%%r1,%3),2 \n\t"
        "vlef %%v16,152(%%r1,%3),3 \n\t"
        "vlef %%v17,156(%%r1,%3),3 \n\t"
        "vlef %%v18,160(%%r1,%3),0 \n\t"
        "vlef %%v19,164(%%r1,%3),0 \n\t"
        "vlef %%v18,168(%%r1,%3),1 \n\t"
        "vlef %%v19,172(%%r1,%3),1 \n\t"
        "vlef %%v18,176(%%r1,%3),2 \n\t"
        "vlef %%v19,180(%%r1,%3),2 \n\t"
        "vlef %%v18,184(%%r1,%3),3 \n\t"
        "vlef %%v19,188(%%r1,%3),3 \n\t"
        "vlef %%v20,192(%%r1,%3),0 \n\t"
        "vlef %%v21,196(%%r1,%3),0 \n\t"
        "vlef %%v20,200(%%r1,%3),1 \n\t"
        "vlef %%v21,204(%%r1,%3),1 \n\t"
        "vlef %%v20,208(%%r1,%3),2 \n\t"
        "vlef %%v21,212(%%r1,%3),2 \n\t"
        "vlef %%v20,216(%%r1,%3),3 \n\t"
        "vlef %%v21,220(%%r1,%3),3 \n\t"
        "vlef %%v22,224(%%r1,%3),0 \n\t"
        "vlef %%v23,228(%%r1,%3),0 \n\t"
        "vlef %%v22,232(%%r1,%3),1 \n\t"
        "vlef %%v23,236(%%r1,%3),1 \n\t"
        "vlef %%v22,240(%%r1,%3),2 \n\t"
        "vlef %%v23,244(%%r1,%3),2 \n\t"
        "vlef %%v22,248(%%r1,%3),3 \n\t"
        "vlef %%v23,252(%%r1,%3),3 \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* advance by 32 complex elements (256 bytes) and loop */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* fold lanes 1,3 into lanes 0,2; on equal values keep the smaller index */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"
        /* fold lane 2 into lane 0 */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* equal values: store the value, return the smaller index */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* different values: take value and index of the larger one */
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamax),"=m"(*amax)
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return iamax;
}
/*
 * Index of an element with the largest |re| + |im| in a complex vector.
 *
 *   n      number of complex elements
 *   x      vector stored as interleaved (re, im) pairs
 *   inc_x  stride between consecutive elements, in complex elements
 *
 * Returns the 1-based index of the selected element, or 0 when n <= 0 or
 * inc_x <= 0.
 *
 * For unit stride the leading multiple-of-32 part is handled by
 * icamax_kernel_32 and the remainder by the scalar loop below.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;
    BLASLONG ix = 0;
    FLOAT maxf = 0;
    BLASLONG max = 0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return(max);

    if (inc_x == 1) {
        BLASLONG n1 = n & -32;

        if (n1 > 0) {
            max = icamax_kernel_32(n1, x, &maxf);
            /* The kernel consumed n1 complex elements: the scalar tail must
               continue at float offset 2 * n1, not restart at x[0]. */
            ix = n1 * 2;
            i = n1;
        } else {
            /* No vector part: seed with the first element, exactly as the
               strided branch does. */
            maxf = CABS1(x,0);
            ix += 2;
            i++;
        }

        while (i < n) {
            if (CABS1(x,ix) > maxf) {
                max = i;
                maxf = CABS1(x,ix);
            }
            ix += 2;
            i++;
        }
        return (max + 1);
    } else {
        inc_x2 = 2 * inc_x;

        /* Seed with the first element, then scan the rest. */
        maxf = CABS1(x,0);
        ix += inc_x2;
        i++;

        while (i < n) {
            if (CABS1(x,ix) > maxf) {
                max = i;
                maxf = CABS1(x,ix);
            }
            ix += inc_x2;
            i++;
        }
        return (max + 1);
    }
}

319
kernel/zarch/icamin.c Normal file
View File

@ -0,0 +1,319 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS: absolute value routine matching the precision selected by DOUBLE. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * CABS1(x, i): |x[i]| + |x[i + 1]|, i.e. the sum of the absolute values of
 * two consecutive entries of x starting at index i. Arguments are fully
 * parenthesised so that expressions such as `p + 2` or `1 << 1` expand
 * correctly; `i` is evaluated twice, so it must be free of side effects.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/*
 * Vector kernel for the unit-stride case: scans n interleaved (re, im) pairs
 * of 4-byte floats starting at x and finds an element with the smallest
 * |re| + |im|.
 *
 *   n     number of complex elements; the loop count is n >> 5 and the loop
 *         is a decrement-and-branch-on-nonzero, so n must be a positive
 *         multiple of 32 (the caller passes n & -32 and checks it is > 0).
 *   x     input vector, 2 * n floats.
 *   amin  (out) receives the winning |re| + |im| value.
 *
 * Returns the 0-based index of the selected element.
 *
 * Register roles:
 *   v0        running minima, one per 32-bit lane (4 lanes)
 *   v1 / v2   64-bit indices belonging to lanes 0,2 / lanes 1,3 of v0
 *   v3        constant 16 (index step per half iteration)
 *   v4        index base of the 16-element group being processed
 *   v24-v27   constant 32-bit lane indices 0..15
 *   r0        remaining loop iterations, r1 byte offset into x
 *
 * Fix relative to the previous revision: the last imaginary lane of v19 in
 * the first half of the loop was loaded from displacement 30 instead of 60.
 */
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
{
    BLASLONG iamin;

    __asm__ volatile (
        /* v0 = |re| + |im| of elements 0..3 (initial per-lane minima) */
        "vlef %%v0,0(%3),0 \n\t"
        "vlef %%v1,4(%3),0 \n\t"
        "vlef %%v0,8(%3),1 \n\t"
        "vlef %%v1,12(%3),1 \n\t"
        "vlef %%v0,16(%3),2 \n\t"
        "vlef %%v1,20(%3),2 \n\t"
        "vlef %%v0,24(%3),3 \n\t"
        "vlef %%v1,28(%3),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v1,%%v1 \n\t"
        "vfasb %%v0,%%v0,%%v1 \n\t"
        /* initial indices: v1 = {0, 2}, v2 = {1, 3}; step 16; base 0 */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,16 \n\t"
        "vzero %%v4 \n\t"
        /* lane index constants 0..15 */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        /* r0 = n / 32 iterations, r1 = 0 byte offset */
        "srlg %%r0,%2,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"
        /* first 16 elements: real parts -> even regs, imaginary -> odd regs */
        "vlef %%v16,0(%%r1,%3),0 \n\t"
        "vlef %%v17,4(%%r1,%3),0 \n\t"
        "vlef %%v16,8(%%r1,%3),1 \n\t"
        "vlef %%v17,12(%%r1,%3),1 \n\t"
        "vlef %%v16,16(%%r1,%3),2 \n\t"
        "vlef %%v17,20(%%r1,%3),2 \n\t"
        "vlef %%v16,24(%%r1,%3),3 \n\t"
        "vlef %%v17,28(%%r1,%3),3 \n\t"
        "vlef %%v18,32(%%r1,%3),0 \n\t"
        "vlef %%v19,36(%%r1,%3),0 \n\t"
        "vlef %%v18,40(%%r1,%3),1 \n\t"
        "vlef %%v19,44(%%r1,%3),1 \n\t"
        "vlef %%v18,48(%%r1,%3),2 \n\t"
        "vlef %%v19,52(%%r1,%3),2 \n\t"
        "vlef %%v18,56(%%r1,%3),3 \n\t"
        "vlef %%v19,60(%%r1,%3),3 \n\t"
        "vlef %%v20,64(%%r1,%3),0 \n\t"
        "vlef %%v21,68(%%r1,%3),0 \n\t"
        "vlef %%v20,72(%%r1,%3),1 \n\t"
        "vlef %%v21,76(%%r1,%3),1 \n\t"
        "vlef %%v20,80(%%r1,%3),2 \n\t"
        "vlef %%v21,84(%%r1,%3),2 \n\t"
        "vlef %%v20,88(%%r1,%3),3 \n\t"
        "vlef %%v21,92(%%r1,%3),3 \n\t"
        "vlef %%v22,96(%%r1,%3),0 \n\t"
        "vlef %%v23,100(%%r1,%3),0 \n\t"
        "vlef %%v22,104(%%r1,%3),1 \n\t"
        "vlef %%v23,108(%%r1,%3),1 \n\t"
        "vlef %%v22,112(%%r1,%3),2 \n\t"
        "vlef %%v23,116(%%r1,%3),2 \n\t"
        "vlef %%v22,120(%%r1,%3),3 \n\t"
        "vlef %%v23,124(%%r1,%3),3 \n\t"
        /* v16..v19 = |re| + |im| for the 16 elements */
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        /* reduce 16 values to 4 lane minima (v16) and their lane indices (v5) */
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* widen indices to 64 bit (v6: lanes 1,3; v5: lanes 0,2), add base */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        /* merge into running minima; strict compare keeps the earlier entry on ties */
        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* second 16 elements of this iteration, same sequence */
        "vlef %%v16,128(%%r1,%3),0 \n\t"
        "vlef %%v17,132(%%r1,%3),0 \n\t"
        "vlef %%v16,136(%%r1,%3),1 \n\t"
        "vlef %%v17,140(%%r1,%3),1 \n\t"
        "vlef %%v16,144(%%r1,%3),2 \n\t"
        "vlef %%v17,148(%%r1,%3),2 \n\t"
        "vlef %%v16,152(%%r1,%3),3 \n\t"
        "vlef %%v17,156(%%r1,%3),3 \n\t"
        "vlef %%v18,160(%%r1,%3),0 \n\t"
        "vlef %%v19,164(%%r1,%3),0 \n\t"
        "vlef %%v18,168(%%r1,%3),1 \n\t"
        "vlef %%v19,172(%%r1,%3),1 \n\t"
        "vlef %%v18,176(%%r1,%3),2 \n\t"
        "vlef %%v19,180(%%r1,%3),2 \n\t"
        "vlef %%v18,184(%%r1,%3),3 \n\t"
        "vlef %%v19,188(%%r1,%3),3 \n\t"
        "vlef %%v20,192(%%r1,%3),0 \n\t"
        "vlef %%v21,196(%%r1,%3),0 \n\t"
        "vlef %%v20,200(%%r1,%3),1 \n\t"
        "vlef %%v21,204(%%r1,%3),1 \n\t"
        "vlef %%v20,208(%%r1,%3),2 \n\t"
        "vlef %%v21,212(%%r1,%3),2 \n\t"
        "vlef %%v20,216(%%r1,%3),3 \n\t"
        "vlef %%v21,220(%%r1,%3),3 \n\t"
        "vlef %%v22,224(%%r1,%3),0 \n\t"
        "vlef %%v23,228(%%r1,%3),0 \n\t"
        "vlef %%v22,232(%%r1,%3),1 \n\t"
        "vlef %%v23,236(%%r1,%3),1 \n\t"
        "vlef %%v22,240(%%r1,%3),2 \n\t"
        "vlef %%v23,244(%%r1,%3),2 \n\t"
        "vlef %%v22,248(%%r1,%3),3 \n\t"
        "vlef %%v23,252(%%r1,%3),3 \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* advance by 32 complex elements (256 bytes) and loop */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* fold lanes 1,3 into lanes 0,2; on equal values keep the smaller index */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"
        /* fold lane 2 into lane 0 */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* equal values: store the value, return the smaller index */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* different values: take value and index of the smaller one */
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamin),"=m"(*amin)
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return iamin;
}
/*
 * Index of an element with the smallest |re| + |im| in a complex vector.
 *
 *   n      number of complex elements
 *   x      vector stored as interleaved (re, im) pairs
 *   inc_x  stride between consecutive elements, in complex elements
 *
 * Returns the 1-based index of the selected element, or 0 when n <= 0 or
 * inc_x <= 0.
 *
 * For unit stride the leading multiple-of-32 part is handled by
 * icamin_kernel_32 and the remainder by the scalar loop below.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;
    BLASLONG ix = 0;
    FLOAT minf = 0;
    BLASLONG min = 0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return(min);

    if (inc_x == 1) {
        BLASLONG n1 = n & -32;

        if (n1 > 0) {
            min = icamin_kernel_32(n1, x, &minf);
            /* The kernel consumed n1 complex elements: the scalar tail must
               continue at float offset 2 * n1, not restart at x[0]. */
            ix = n1 * 2;
            i = n1;
        } else {
            /* No vector part: seed with the first element. Leaving minf at 0
               would make the `<` test below unsatisfiable (|re| + |im| is
               never negative) and the result would always be index 1. */
            minf = CABS1(x,0);
            ix += 2;
            i++;
        }

        while (i < n) {
            if (CABS1(x,ix) < minf) {
                min = i;
                minf = CABS1(x,ix);
            }
            ix += 2;
            i++;
        }
        return (min + 1);
    } else {
        inc_x2 = 2 * inc_x;

        /* Seed with the first element, then scan the rest. */
        minf = CABS1(x,0);
        ix += inc_x2;
        i++;

        while (i < n) {
            if (CABS1(x,ix) < minf) {
                min = i;
                minf = CABS1(x,ix);
            }
            ix += inc_x2;
            i++;
        }
        return (min + 1);
    }
}

View File

@ -23,164 +23,173 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/**
* Find maximum index
* Warning: requirements n>0 and n % 32 == 0
* @param n
* @param x pointer to the vector
* @param maxf (out) maximum absolute value .( only for output )
* @return index
*/
static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
__asm__(
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vleig %%v20,0,0 \n\t"
"vleig %%v20,1,1 \n\t"
"vleig %%v21,2,0 \n\t"
"vleig %%v21,3,1 \n\t"
"vleig %%v22,4,0 \n\t"
"vleig %%v22,5,1 \n\t"
"vleig %%v23,6,0 \n\t"
"vleig %%v23,7,1 \n\t"
"vrepig %%v4,8 \n\t"
"vzero %%v5 \n\t"
"vzero %%v18 \n\t"
"vzero %%v19 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfchdb %%v16,%%v25,%%v24 \n\t "
"vfchdb %%v17,%%v27,%%v26 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v29,%%v28 \n\t "
"vfchdb %%v17,%%v31,%%v30 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
{
BLASLONG iamax;
"vfchdb %%v28, %%v3,%%v0 \n\t"
"vfchdb %%v29,%%v27, %%v25 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"vag %%v1,%%v1,%%v5 \n\t"
"vag %%v24,%%v24,%%v5 \n\t"
"vag %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16,%%v25 , %%v0 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
"vfchdb %%v17, %%v29,%%v18 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfchdb %%v16,%%v25,%%v24 \n\t "
"vfchdb %%v17,%%v27,%%v26 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v29,%%v28 \n\t "
"vfchdb %%v17,%%v31,%%v30 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
__asm__ volatile (
"vl %%v0,0(%3) \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vfchdb %%v28, %%v3,%%v0 \n\t"
"vfchdb %%v29,%%v27, %%v25 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"vag %%v1,%%v1,%%v5 \n\t"
"vag %%v24,%%v24,%%v5 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vag %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16,%%v25 , %%v0 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
"vfchdb %%v17, %%v29,%%v18 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v4,%%v16,%%v17 \n\t"
"vfchdb %%v5,%%v18,%%v19 \n\t"
"vfchdb %%v6,%%v20,%%v21 \n\t"
"vfchdb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vrepg %%v26,%%v18,1 \n\t"
"vrepg %%v5,%%v19,1 \n\t"
"wfcdb %%v26,%%v18 \n\t"
"jne 2f \n\t"
"vsteg %%v18,%[maxf],0 \n\t"
"vmnlg %%v1,%%v5,%%v19 \n\t"
"j 3f \n\t"
"vfchdb %%v20,%%v16,%%v17 \n\t"
"vfchdb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"2: \n\t"
"wfchdb %%v16,%%v26,%%v18 \n\t"
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
"std %%f0,%[maxf] \n\t"
"3: \n\t"
"vlgvg %[index],%%v1,0 \n\t"
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return index;
"vfchdb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchdb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v4,%%v16,%%v17 \n\t"
"vfchdb %%v5,%%v18,%%v19 \n\t"
"vfchdb %%v6,%%v20,%%v21 \n\t"
"vfchdb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchdb %%v20,%%v16,%%v17 \n\t"
"vfchdb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchdb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchdb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return iamax;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
BLASLONG ix = 0;
FLOAT maxf = 0.0;
BLASLONG max = 0;
@ -191,7 +200,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG n1 = n & -32;
if (n1 > 0) {
max = diamax_kernel_32_TUNED(n1, x, &maxf);
max = idamax_kernel_32(n1, x, &maxf);
i = n1;
}

View File

@ -23,192 +23,185 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/**
* Find minimum index
* Warning: requirements n>0 and n % 32 == 0
* @param n
* @param x pointer to the vector
* @param minf (out) minimum absolute value .( only for output )
* @return minimum index
*/
static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
__asm__(
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vleig %%v20,0,0 \n\t"
"vleig %%v20,1,1 \n\t"
"vleig %%v21,2,0 \n\t"
"vleig %%v21,3,1 \n\t"
"vleig %%v22,4,0 \n\t"
"vleig %%v22,5,1 \n\t"
"vleig %%v23,6,0 \n\t"
"vleig %%v23,7,1 \n\t"
"vrepig %%v4,8 \n\t"
"vlrepg %%v18,0(%[ptr_x]) \n\t"
"vzero %%v5 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vzero %%v19 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
{
BLASLONG iamin;
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
__asm__ volatile (
"vl %%v0,0(%3) \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vfchdb %%v16,%%v24,%%v25 \n\t "
"vfchdb %%v17,%%v26 ,%%v27 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v28, %%v29 \n\t "
"vfchdb %%v17,%%v30,%%v31 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v4,%%v17,%%v16 \n\t"
"vfchdb %%v5,%%v19,%%v18 \n\t"
"vfchdb %%v6,%%v21,%%v20 \n\t"
"vfchdb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchdb %%v20,%%v17,%%v16 \n\t"
"vfchdb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchdb %%v28,%%v0 , %%v3 \n\t"
"vfchdb %%v29, %%v25,%%v27 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"vfchdb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vag %%v1,%%v1,%%v5 \n\t"
"vag %%v24,%%v24,%%v5 \n\t"
"vag %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"vfchdb %%v16, %%v0,%%v25 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v17,%%v18, %%v29 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"vfchdb %%v4,%%v17,%%v16 \n\t"
"vfchdb %%v5,%%v19,%%v18 \n\t"
"vfchdb %%v6,%%v21,%%v20 \n\t"
"vfchdb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vfchdb %%v20,%%v17,%%v16 \n\t"
"vfchdb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfchdb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchdb %%v16,%%v24,%%v25 \n\t"
"vfchdb %%v17,%%v26 ,%%v27 \n\t"
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v28 ,%%v29 \n\t"
"vfchdb %%v17,%%v30,%%v31 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
"vfchdb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vfchdb %%v28,%%v0 , %%v3 \n\t"
"vfchdb %%v29, %%v25,%%v27 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"vag %%v1,%%v1,%%v5 \n\t"
"vag %%v24,%%v24,%%v5 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vag %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16, %%v0,%%v25 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
"vfchdb %%v17,%%v18, %%v29 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
"vrepg %%v26,%%v18,1 \n\t"
"vrepg %%v5,%%v19,1 \n\t"
"wfcdb %%v26,%%v18 \n\t"
"jne 2f \n\t"
"vsteg %%v18,%[minf],0 \n\t"
"vmnlg %%v1,%%v5,%%v19 \n\t"
"j 3f \n\t"
"2: \n\t"
"wfchdb %%v16,%%v18 ,%%v26 \n\t "
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
"std %%f0,%[minf] \n\t"
"3: \n\t"
"vlgvg %[index],%%v1,0 \n\t"
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return index;
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v0,%%v2 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"2: \n\t"
"nop "
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return iamin;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
BLASLONG ix = 0;
BLASLONG min = 0;
FLOAT minf = 0.0;
BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min);
minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant
if (inc_x == 1) {
BLASLONG n1 = n & -32;
if (n1 > 0) {
min = diamin_kernel_32(n1, x, &minf);
min = idamin_kernel_32(n1, x, &minf);
i = n1;
}

232
kernel/zarch/idmax.c Normal file
View File

@ -0,0 +1,232 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vectorised search for the largest element of a unit-stride vector and its
 * position, written with z/Architecture vector instructions.
 *
 * n    element count. The loop below runs (n >> 5) times and consumes
 *      32 elements (256 bytes) per pass; the caller in this file only passes
 *      a non-zero multiple of 32.
 * x    input vector, read at unit stride.
 * max  receives the largest value found.
 *
 * Returns the 0-based index of the selected element.
 *
 * Assumes FLOAT is double (the double-precision instruction forms are used)
 * -- TODO confirm against the build configuration.
 * NOTE(review): brctg decrements before testing, so a zero pass count would
 * not exit the loop promptly; callers must keep n >= 32.
 */
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
{
BLASLONG imax;
__asm__ volatile (
/* v0 = running per-lane maxima, seeded with x[0], x[1]; v1 = their indices {0,1} */
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
/* v2 = {16,16}: index advance per half pass; v3 = index base, starts at 0 */
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
/* v24..v31 = local indices 0..15 of the 16 elements loaded per half pass */
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
/* r0 = pass count (n / 32); r1 = byte offset into x */
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
/* first half of the pass: load 16 elements */
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
/* tournament level 1: mask = (first > second), then select value and local index.
   NOTE(review): the compares are strict, so on equal values the second
   (higher-index) operand is taken -- confirm whether first-occurrence
   tie-breaking is required inside a half pass. */
"vfchdb %%v4,%%v16,%%v17 \n\t"
"vfchdb %%v5,%%v18,%%v19 \n\t"
"vfchdb %%v6,%%v20,%%v21 \n\t"
"vfchdb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
/* level 2 */
"vfchdb %%v20,%%v16,%%v17 \n\t"
"vfchdb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
/* level 3: v16 = half-pass maxima, v4 = their local indices */
"vfchdb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
/* local index -> global index, then merge into the running result
   (strict compare: the running value is kept on ties) */
"vag %%v4,%%v4,%%v3 \n\t"
"vfchdb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
/* second half of the pass: same sequence on the next 16 elements */
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchdb %%v4,%%v16,%%v17 \n\t"
"vfchdb %%v5,%%v18,%%v19 \n\t"
"vfchdb %%v6,%%v20,%%v21 \n\t"
"vfchdb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchdb %%v20,%%v16,%%v17 \n\t"
"vfchdb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchdb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchdb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
/* advance 256 bytes and loop while the pass count is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal reduction of the two lanes: v2/v3 = lane 1 value/index */
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
/* lanes compare equal: store the value, return the smaller index */
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
/* lanes differ: take lane 1 when it is greater than lane 0 */
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"2: \n\t"
"nop "
:"=r"(imax),"=m"(*max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return imax;
}
/*
 * Return the 1-based position of the largest element of x.
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  distance between consecutive elements, in elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0. For unit stride the leading multiple
 * of 32 elements is handled by idmax_kernel_32 and the remainder by the
 * scalar loop; other strides use the scalar loops only. The scalar loops use
 * a strict comparison, so the earliest of several equal maxima is kept.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;   /* element offset into x */
    BLASLONG j = 0;   /* logical element number on the strided path */
    FLOAT maxf;
    BLASLONG max = 0;

    if (n <= 0 || inc_x <= 0) return (max);

    /* Seed the running maximum with the first element. Seeding with 0.0
       meant a vector holding only negative values never replaced the
       candidate, so position 1 was returned whatever the data. */
    maxf = x[0];

    if (inc_x == 1) {
        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            /* the kernel overwrites maxf with the maximum of x[0..n1-1] */
            max = idmax_kernel_32(n1, x, &maxf);
            i = n1;
        }
        while (i < n) {
            if (x[i] > maxf) {
                max = i;
                maxf = x[i];
            }
            i++;
        }
        return (max + 1);
    } else {
        BLASLONG n1 = n & -4;
        /* four elements per iteration, compared in ascending order */
        while (j < n1) {
            if (x[i] > maxf) {
                max = j;
                maxf = x[i];
            }
            if (x[i + inc_x] > maxf) {
                max = j + 1;
                maxf = x[i + inc_x];
            }
            if (x[i + 2 * inc_x] > maxf) {
                max = j + 2;
                maxf = x[i + 2 * inc_x];
            }
            if (x[i + 3 * inc_x] > maxf) {
                max = j + 3;
                maxf = x[i + 3 * inc_x];
            }
            i += inc_x * 4;
            j += 4;
        }
        while (j < n) {
            if (x[i] > maxf) {
                max = j;
                maxf = x[i];
            }
            i += inc_x;
            j++;
        }
        return (max + 1);
    }
}

232
kernel/zarch/idmin.c Normal file
View File

@ -0,0 +1,232 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vectorised search for the smallest element of a unit-stride vector and its
 * position, written with z/Architecture vector instructions.
 *
 * n    element count. The loop below runs (n >> 5) times and consumes
 *      32 elements (256 bytes) per pass; the caller in this file only passes
 *      a non-zero multiple of 32.
 * x    input vector, read at unit stride.
 * min  receives the smallest value found.
 *
 * Returns the 0-based index of the selected element.
 *
 * Assumes FLOAT is double (the double-precision instruction forms are used)
 * -- TODO confirm against the build configuration.
 * NOTE(review): brctg decrements before testing, so a zero pass count would
 * not exit the loop promptly; callers must keep n >= 32.
 */
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
{
BLASLONG imin;
__asm__ volatile (
/* v0 = running per-lane minima, seeded with x[0], x[1]; v1 = their indices {0,1} */
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
/* v2 = {16,16}: index advance per half pass; v3 = index base, starts at 0 */
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
/* v24..v31 = local indices 0..15 of the 16 elements loaded per half pass */
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
/* r0 = pass count (n / 32); r1 = byte offset into x */
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
/* first half of the pass: load 16 elements */
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
/* tournament level 1: mask = (second > first), i.e. first is the smaller;
   then select value and local index.
   NOTE(review): the compares are strict, so on equal values the second
   (higher-index) operand is taken -- confirm whether first-occurrence
   tie-breaking is required inside a half pass. */
"vfchdb %%v4,%%v17,%%v16 \n\t"
"vfchdb %%v5,%%v19,%%v18 \n\t"
"vfchdb %%v6,%%v21,%%v20 \n\t"
"vfchdb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
/* level 2 */
"vfchdb %%v20,%%v17,%%v16 \n\t"
"vfchdb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
/* level 3: v16 = half-pass minima, v4 = their local indices */
"vfchdb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
/* local index -> global index, then merge into the running result
   (strict compare: the running value is kept on ties) */
"vag %%v4,%%v4,%%v3 \n\t"
"vfchdb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
/* second half of the pass: same sequence on the next 16 elements */
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchdb %%v4,%%v17,%%v16 \n\t"
"vfchdb %%v5,%%v19,%%v18 \n\t"
"vfchdb %%v6,%%v21,%%v20 \n\t"
"vfchdb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchdb %%v20,%%v17,%%v16 \n\t"
"vfchdb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchdb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchdb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
/* advance 256 bytes and loop while the pass count is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal reduction of the two lanes: v2/v3 = lane 1 value/index */
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
/* lanes compare equal: store the value, return the smaller index */
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
/* lanes differ: take lane 1 when lane 0 is greater than it */
"wfchdb %%v4,%%v0,%%v2 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"2: \n\t"
"nop "
:"=r"(imin),"=m"(*min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return imin;
}
/*
 * Return the 1-based position of the smallest element of x.
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  distance between consecutive elements, in elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0. For unit stride the leading multiple
 * of 32 elements is handled by idmin_kernel_32 and the remainder by the
 * scalar loop; other strides use the scalar loops only. The scalar loops use
 * a strict comparison, so the earliest of several equal minima is kept.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;   /* element offset into x */
    BLASLONG j = 0;   /* logical element number on the strided path */
    FLOAT minf;
    BLASLONG min = 0;

    if (n <= 0 || inc_x <= 0) return (min);

    /* Seed the running minimum with the first element. Seeding with 0.0
       meant a vector holding only positive values never replaced the
       candidate, so position 1 was returned whatever the data. */
    minf = x[0];

    if (inc_x == 1) {
        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            /* the kernel overwrites minf with the minimum of x[0..n1-1] */
            min = idmin_kernel_32(n1, x, &minf);
            i = n1;
        }
        while (i < n) {
            if (x[i] < minf) {
                min = i;
                minf = x[i];
            }
            i++;
        }
        return (min + 1);
    } else {
        BLASLONG n1 = n & -4;
        /* four elements per iteration, compared in ascending order */
        while (j < n1) {
            if (x[i] < minf) {
                min = j;
                minf = x[i];
            }
            if (x[i + inc_x] < minf) {
                min = j + 1;
                minf = x[i + inc_x];
            }
            if (x[i + 2 * inc_x] < minf) {
                min = j + 2;
                minf = x[i + 2 * inc_x];
            }
            if (x[i + 3 * inc_x] < minf) {
                min = j + 3;
                minf = x[i + 3 * inc_x];
            }
            i += inc_x * 4;
            j += 4;
        }
        while (j < n) {
            if (x[i] < minf) {
                min = j;
                minf = x[i];
            }
            i += inc_x;
            j++;
        }
        return (min + 1);
    }
}

299
kernel/zarch/isamax.c Normal file
View File

@ -0,0 +1,299 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vectorised search for the element of largest absolute value in a
 * unit-stride vector, written with z/Architecture vector instructions.
 *
 * n     element count. The loop runs (n >> 6) times and consumes 64 elements
 *       (256 bytes) per pass; the caller in this file only passes a non-zero
 *       multiple of 64.
 * x     input vector, read at unit stride.
 * amax  receives the largest absolute value found.
 *
 * Returns the 0-based index of the selected element.
 *
 * Assumes FLOAT is float (the short-format instruction forms are used)
 * -- TODO confirm against the build configuration.
 * NOTE(review): brctg decrements before testing, so a zero pass count would
 * not exit the loop promptly; callers must keep n >= 64.
 *
 * Change from the previous revision: v3 is written by the template (vrepig,
 * veslg, vrepg) and is now named in the clobber list like every other vector
 * register the template modifies.
 */
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
{
    BLASLONG iamax;

    __asm__ volatile (
        /* v0 = running |x| maxima for four float lanes, seeded from x[0..3] */
        "vl %%v0,0(%3) \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        /* 64-bit indices: v1 = {0,2} for the even float lanes,
           v2 = {1,3} for the odd float lanes */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = {32,32}: index advance per half pass; v4 = index base */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31 = 32-bit local indices 0..31 of a half pass */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = pass count (n / 64); r1 = byte offset into x */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"
        /* first half of the pass: load 32 elements and take absolute values */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        /* three-level tournament: mask = (first > second), then select value
           and local index.
           NOTE(review): the compares are strict, so on equal values the
           second (higher-index) operand is taken -- confirm whether
           first-occurrence tie-breaking is required inside a half pass. */
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* widen the 32-bit local indices to 64 bits (v6 = odd lanes,
           v5 = even lanes) and add the index base */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        /* merge into the running result; the word mask is widened to
           doubleword masks (v7 = even lanes, v8 = odd lanes) for the indices */
        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* second half of the pass: same sequence on the next 32 elements */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* advance 256 bytes and loop while the pass count is non-zero */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* reduce 4 lanes to 2: compare each even lane with its odd
           neighbour; keep the even lane when it is greater, or when the
           values are equal and its index is the lower one */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"
        /* reduce 2 lanes to 1: v2/v3 = value/index of the second candidate */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* candidates compare equal: store the value, return the smaller index */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* candidates differ: take the second one when it is greater.
           NOTE(review): wfchsb produces a 32-bit mask while the vsel on v1
           selects a 64-bit index -- confirm the rest of the mask is well
           defined here. */
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamax),"=m"(*amax)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return iamax;
}
/*
 * Locate the element of x with the largest absolute value and report its
 * 1-based position.
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  distance between consecutive elements, in elements
 *
 * A result of 0 signals n <= 0 or inc_x <= 0. With unit stride the leading
 * multiple of 64 elements goes through isamax_kernel_64 and the scalar loop
 * finishes the rest; any other stride is handled entirely by scalar code.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG best_pos = 0;
    BLASLONG k;

    if (n <= 0 || inc_x <= 0) {
        return best_pos;
    }

    if (inc_x == 1) {
        BLASLONG bulk = n & -64;
        BLASLONG first = 0;

        if (bulk > 0) {
            best_pos = isamax_kernel_64(bulk, x, &best);
            first = bulk;
        }
        /* scalar tail; strict '>' keeps the earlier element on ties */
        for (k = first; k < n; k++) {
            if (ABS(x[k]) > best) {
                best_pos = k;
                best = ABS(x[k]);
            }
        }
        return best_pos + 1;
    }

    {
        const FLOAT *p = x;
        const BLASLONG quads = n & -4;

        /* strided walk, four elements per trip, visited in ascending order */
        for (k = 0; k < quads; k += 4, p += inc_x * 4) {
            if (ABS(p[0]) > best) {
                best_pos = k;
                best = ABS(p[0]);
            }
            if (ABS(p[inc_x]) > best) {
                best_pos = k + 1;
                best = ABS(p[inc_x]);
            }
            if (ABS(p[2 * inc_x]) > best) {
                best_pos = k + 2;
                best = ABS(p[2 * inc_x]);
            }
            if (ABS(p[3 * inc_x]) > best) {
                best_pos = k + 3;
                best = ABS(p[3 * inc_x]);
            }
        }
        /* up to three leftover elements */
        for (; k < n; k++, p += inc_x) {
            if (ABS(*p) > best) {
                best_pos = k;
                best = ABS(*p);
            }
        }
        return best_pos + 1;
    }
}

299
kernel/zarch/isamin.c Normal file
View File

@ -0,0 +1,299 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vectorised search for the element of smallest absolute value in a
 * unit-stride vector, written with z/Architecture vector instructions.
 *
 * n     element count. The loop runs (n >> 6) times and consumes 64 elements
 *       (256 bytes) per pass; the caller in this file only passes a non-zero
 *       multiple of 64.
 * x     input vector, read at unit stride.
 * amin  receives the smallest absolute value found.
 *
 * Returns the 0-based index of the selected element.
 *
 * Assumes FLOAT is float (the short-format instruction forms are used)
 * -- TODO confirm against the build configuration.
 * NOTE(review): brctg decrements before testing, so a zero pass count would
 * not exit the loop promptly; callers must keep n >= 64.
 *
 * Change from the previous revision: v3 is written by the template (vrepig,
 * veslg, vrepg) and is now named in the clobber list like every other vector
 * register the template modifies.
 */
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
{
    BLASLONG iamin;

    __asm__ volatile (
        /* v0 = running |x| minima for four float lanes, seeded from x[0..3] */
        "vl %%v0,0(%3) \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        /* 64-bit indices: v1 = {0,2} for the even float lanes,
           v2 = {1,3} for the odd float lanes */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = {32,32}: index advance per half pass; v4 = index base */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31 = 32-bit local indices 0..31 of a half pass */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = pass count (n / 64); r1 = byte offset into x */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"
        /* first half of the pass: load 32 elements and take absolute values */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        /* three-level tournament: mask = (second > first), i.e. first is the
           smaller; then select value and local index.
           NOTE(review): the compares are strict, so on equal values the
           second (higher-index) operand is taken -- confirm whether
           first-occurrence tie-breaking is required inside a half pass. */
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* widen the 32-bit local indices to 64 bits (v6 = odd lanes,
           v5 = even lanes) and add the index base */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        /* merge into the running result; the word mask is widened to
           doubleword masks (v7 = even lanes, v8 = odd lanes) for the indices */
        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* second half of the pass: same sequence on the next 32 elements */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* advance 256 bytes and loop while the pass count is non-zero */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* reduce 4 lanes to 2: compare each even lane with its odd
           neighbour; keep the even lane when it is smaller, or when the
           values are equal and its index is the lower one */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"
        /* reduce 2 lanes to 1: v2/v3 = value/index of the second candidate */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* candidates compare equal: store the value, return the smaller index */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* candidates differ: take the second one when the first is greater.
           NOTE(review): wfchsb produces a 32-bit mask while the vsel on v1
           selects a 64-bit index -- confirm the rest of the mask is well
           defined here. */
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamin),"=m"(*amin)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return iamin;
}
/*
 * Return the 1-based position of the element of x with the smallest
 * absolute value.
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  distance between consecutive elements, in elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0. For unit stride the leading multiple
 * of 64 elements is handled by isamin_kernel_64 and the remainder by the
 * scalar loop; other strides use the scalar loops only. The scalar loops use
 * a strict comparison, so the earliest of several equal minima is kept.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;   /* element offset into x */
    BLASLONG j = 0;   /* logical element number on the strided path */
    FLOAT minf;
    BLASLONG min = 0;

    if (n <= 0 || inc_x <= 0) return (min);

    /* Seed the running minimum with |x[0]|. Seeding with 0.0 made the test
       ABS(x[i]) < minf impossible to satisfy, so the scalar paths returned
       position 1 for every input. */
    minf = ABS(x[0]);

    if (inc_x == 1) {
        BLASLONG n1 = n & -64;
        if (n1 > 0) {
            /* the kernel overwrites minf with the minimum of |x[0..n1-1]| */
            min = isamin_kernel_64(n1, x, &minf);
            i = n1;
        }
        while (i < n) {
            if (ABS(x[i]) < minf) {
                min = i;
                minf = ABS(x[i]);
            }
            i++;
        }
        return (min + 1);
    } else {
        BLASLONG n1 = n & -4;
        /* four elements per iteration, compared in ascending order */
        while (j < n1) {
            if (ABS(x[i]) < minf) {
                min = j;
                minf = ABS(x[i]);
            }
            if (ABS(x[i + inc_x]) < minf) {
                min = j + 1;
                minf = ABS(x[i + inc_x]);
            }
            if (ABS(x[i + 2 * inc_x]) < minf) {
                min = j + 2;
                minf = ABS(x[i + 2 * inc_x]);
            }
            if (ABS(x[i + 3 * inc_x]) < minf) {
                min = j + 3;
                minf = ABS(x[i + 3 * inc_x]);
            }
            i += inc_x * 4;
            j += 4;
        }
        while (j < n) {
            if (ABS(x[i]) < minf) {
                min = j;
                minf = ABS(x[i]);
            }
            i += inc_x;
            j++;
        }
        return (min + 1);
    }
}

275
kernel/zarch/ismax.c Normal file
View File

@ -0,0 +1,275 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vectorised search for the largest element of a unit-stride vector and its
 * position, written with z/Architecture vector instructions.
 *
 * n    element count. The loop runs (n >> 6) times and consumes 64 elements
 *      (256 bytes) per pass; the caller in this file only passes a non-zero
 *      multiple of 64.
 * x    input vector, read at unit stride.
 * max  receives the largest value found.
 *
 * Returns the 0-based index of the selected element.
 *
 * Assumes FLOAT is float (the short-format instruction forms are used)
 * -- TODO confirm against the build configuration.
 * NOTE(review): brctg decrements before testing, so a zero pass count would
 * not exit the loop promptly; callers must keep n >= 64.
 *
 * Change from the previous revision: v3 is written by the template (vrepig,
 * veslg, vrepg) and is now named in the clobber list like every other vector
 * register the template modifies.
 */
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
{
    BLASLONG imax;

    __asm__ volatile (
        /* v0 = running maxima for four float lanes, seeded from x[0..3] */
        "vl %%v0,0(%3) \n\t"
        /* 64-bit indices: v1 = {0,2} for the even float lanes,
           v2 = {1,3} for the odd float lanes */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = {32,32}: index advance per half pass; v4 = index base */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31 = 32-bit local indices 0..31 of a half pass */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = pass count (n / 64); r1 = byte offset into x */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"
        /* first half of the pass: load 32 elements */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        /* three-level tournament: mask = (first > second), then select value
           and local index.
           NOTE(review): the compares are strict, so on equal values the
           second (higher-index) operand is taken -- confirm whether
           first-occurrence tie-breaking is required inside a half pass. */
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* widen the 32-bit local indices to 64 bits (v6 = odd lanes,
           v5 = even lanes) and add the index base */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        /* merge into the running result; the word mask is widened to
           doubleword masks (v7 = even lanes, v8 = odd lanes) for the indices */
        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* second half of the pass: same sequence on the next 32 elements */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* advance 256 bytes and loop while the pass count is non-zero */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* reduce 4 lanes to 2: compare each even lane with its odd
           neighbour; keep the even lane when it is greater, or when the
           values are equal and its index is the lower one */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"
        /* reduce 2 lanes to 1: v2/v3 = value/index of the second candidate */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* candidates compare equal: store the value, return the smaller index */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* candidates differ: take the second one when it is greater.
           NOTE(review): wfchsb produces a 32-bit mask while the vsel on v1
           selects a 64-bit index -- confirm the rest of the mask is well
           defined here. */
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(imax),"=m"(*max)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imax;
}
/*
 * Return the 1-based position of the largest element of x.
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  distance between consecutive elements, in elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0. For unit stride the leading multiple
 * of 64 elements is handled by ismax_kernel_64 and the remainder by the
 * scalar loop; other strides use the scalar loops only. The scalar loops use
 * a strict comparison, so the earliest of several equal maxima is kept.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;   /* element offset into x */
    BLASLONG j = 0;   /* logical element number on the strided path */
    FLOAT maxf;
    BLASLONG max = 0;

    if (n <= 0 || inc_x <= 0) return (max);

    /* Seed the running maximum with the first element. Seeding with 0.0
       meant a vector holding only negative values never replaced the
       candidate, so position 1 was returned whatever the data. */
    maxf = x[0];

    if (inc_x == 1) {
        BLASLONG n1 = n & -64;
        if (n1 > 0) {
            /* the kernel overwrites maxf with the maximum of x[0..n1-1] */
            max = ismax_kernel_64(n1, x, &maxf);
            i = n1;
        }
        while (i < n) {
            if (x[i] > maxf) {
                max = i;
                maxf = x[i];
            }
            i++;
        }
        return (max + 1);
    } else {
        BLASLONG n1 = n & -4;
        /* four elements per iteration, compared in ascending order */
        while (j < n1) {
            if (x[i] > maxf) {
                max = j;
                maxf = x[i];
            }
            if (x[i + inc_x] > maxf) {
                max = j + 1;
                maxf = x[i + inc_x];
            }
            if (x[i + 2 * inc_x] > maxf) {
                max = j + 2;
                maxf = x[i + 2 * inc_x];
            }
            if (x[i + 3 * inc_x] > maxf) {
                max = j + 3;
                maxf = x[i + 3 * inc_x];
            }
            i += inc_x * 4;
            j += 4;
        }
        while (j < n) {
            if (x[i] > maxf) {
                max = j;
                maxf = x[i];
            }
            i += inc_x;
            j++;
        }
        return (max + 1);
    }
}

275
kernel/zarch/ismin.c Normal file
View File

@ -0,0 +1,275 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector kernel: locate the smallest element among the first n entries of a
 * contiguous single-precision vector.
 *
 * n    element count.  The loop executes n >> 6 times and is closed by a
 *      decrement-and-branch (brctg), so n must be a positive multiple of 64.
 * x    input vector (unit stride)
 * min  out: value of the selected element
 *
 * Returns the 0-based index of the selected element.
 *
 * Register roles inside the asm:
 *   v0        running minimum, one value per 32-bit lane
 *   v1 / v2   64-bit indices belonging to the even / odd lanes of v0
 *   v3        index step per 32-element half (32); reused as scratch after
 *             the loop
 *   v4        index base of the half currently processed; scratch after loop
 *   v24..v31  constant lane indices 0..31 within a 32-element half
 *   r0 / r1   loop counter / byte offset into x
 *
 * NOTE(review): inside a 32-element half the reduction uses a strict compare
 * and vsel keeps its second source on equality, so equal values resolve to
 * the later register (higher index).  Confirm whether first-occurrence
 * semantics are required for ties.
 */
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
{
    BLASLONG imin;

    __asm__ volatile (
        /* Seed the running minimum with x[0..3] and their indices. */
        "vl %%v0,0(%3) \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* Lane-index constants 0..31 for one 32-element half. */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = number of 64-element iterations, r1 = byte offset 0. */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"
        /* First half: elements 0..31 of this iteration. */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        /* Pairwise reduction of values (v16..) and lane indices (v5..). */
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* Widen 32-bit lane indices to 64 bits and add the half's base. */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        /* Merge into the running minimum (strict compare keeps the old one). */
        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* Second half: elements 32..63 of this iteration. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* Advance 256 bytes (64 floats) and loop. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* Fold odd lanes onto even lanes; on equal values the smaller index wins. */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"
        /* Final two candidates: lane 0 versus lane 2. */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Equal values: store the value and take the smaller index. */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Different values: pick the smaller one together with its index. */
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(imin),"=m"(*min)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        /* v3 is written by the asm above and therefore has to be listed. */
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imin;
}
/*
 * Index of the smallest element of a strided single-precision vector.
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  distance, in elements, between consecutive entries of x
 *
 * Returns the 1-based position of the selected element, or 0 when
 * n <= 0 or inc_x <= 0.
 *
 * The running minimum is always seeded from the data (either by the vector
 * kernel or from x[0]).  Seeding it with the constant 0.0 would make every
 * vector whose minimum is >= 0 report position 1.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;      /* offset into x */
    BLASLONG j = 0;      /* logical element number on the strided path */
    FLOAT minf = 0.0;    /* running minimum */
    BLASLONG min = 0;    /* 0-based position of the running minimum */

    if (n <= 0 || inc_x <= 0) return (min);

    if (inc_x == 1) {
        /* The kernel handles the leading multiple of 64 elements. */
        BLASLONG n1 = n & -64;
        if (n1 > 0) {
            min = ismin_kernel_64(n1, x, &minf);
            i = n1;
        } else {
            /* Fewer than 64 elements: start from the first element. */
            minf = x[0];
            i = 1;
        }

        /* Scalar tail. */
        while (i < n) {
            if (x[i] < minf) {
                min = i;
                minf = x[i];
            }
            i++;
        }
        return (min + 1);
    } else {
        /* Strided access: start from the first element. */
        minf = x[0];
        i = inc_x;
        j = 1;

        /* Four elements per pass while four more are available. */
        while (j + 3 < n) {
            if (x[i] < minf) {
                min = j;
                minf = x[i];
            }
            if (x[i + inc_x] < minf) {
                min = j + 1;
                minf = x[i + inc_x];
            }
            if (x[i + 2 * inc_x] < minf) {
                min = j + 2;
                minf = x[i + 2 * inc_x];
            }
            if (x[i + 3 * inc_x] < minf) {
                min = j + 3;
                minf = x[i + 3 * inc_x];
            }
            i += inc_x * 4;
            j += 4;
        }

        /* Remaining 0..3 elements. */
        while (j < n) {
            if (x[i] < minf) {
                min = j;
                minf = x[i];
            }
            i += inc_x;
            j++;
        }
        return (min + 1);
    }
}

View File

@ -24,190 +24,165 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#define ABS fabs
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
{
BLASLONG iamax;
__asm__ volatile (
"vleg %%v0,0(%3),0 \n\t"
"vleg %%v1,8(%3),0 \n\t"
"vleg %%v0,16(%3),1 \n\t"
"vleg %%v1,24(%3),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v1,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,8 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"srlg %%r0,%2,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
/**
* Find maximum index
* Warning: requirements n>0 and n % 16 == 0
* @param n
* @param x pointer to the vector
* @param maxf (out) maximum absolute value .( only for output )
* @return index
*/
static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
__asm__(
"pfd 1, 0(%[ptr_x]) \n\t"
"vleig %%v16,0,0 \n\t"
"vleig %%v16,1,1 \n\t"
"vleig %%v17,2,0 \n\t"
"vleig %%v17,3,1 \n\t"
"vleig %%v18,4,0 \n\t"
"vleig %%v18,5,1 \n\t"
"vleig %%v19,6,0 \n\t"
"vleig %%v19,7,1 \n\t"
"vleig %%v20,8,0 \n\t"
"vleig %%v20,9,1 \n\t"
"vleig %%v21,10,0 \n\t"
"vleig %%v21,11,1 \n\t"
"vleig %%v22,12,0 \n\t"
"vleig %%v22,13,1 \n\t"
"vleig %%v23,14,0 \n\t"
"vleig %%v23,15,1 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vzero %%v6 \n\t"
"vzero %%v7 \n\t"
"vrepig %%v4,16 \n\t"
"vzero %%v5 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vleg %%v16,0(%%r1,%3),0 \n\t"
"vleg %%v17,8(%%r1,%3),0 \n\t"
"vleg %%v16,16(%%r1,%3),1 \n\t"
"vleg %%v17,24(%%r1,%3),1 \n\t"
"vleg %%v18,32(%%r1,%3),0 \n\t"
"vleg %%v19,40(%%r1,%3),0 \n\t"
"vleg %%v18,48(%%r1,%3),1 \n\t"
"vleg %%v19,56(%%r1,%3),1 \n\t"
"vleg %%v20,64(%%r1,%3),0 \n\t"
"vleg %%v21,72(%%r1,%3),0 \n\t"
"vleg %%v20,80(%%r1,%3),1 \n\t"
"vleg %%v21,88(%%r1,%3),1 \n\t"
"vleg %%v22,96(%%r1,%3),0 \n\t"
"vleg %%v23,104(%%r1,%3),0 \n\t"
"vleg %%v22,112(%%r1,%3),1 \n\t"
"vleg %%v23,120(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v24,%%v25 \n\t"
"vfadb %%v1,%%v26,%%v27 \n\t"
"vfadb %%v2,%%v28,%%v29 \n\t"
"vfadb %%v3,%%v30,%%v31 \n\t"
"vleg %%v24 , 128(%[ptr_tmp]),0 \n\t"
"vleg %%v25 , 136(%[ptr_tmp]),0 \n\t"
"vleg %%v24 , 144(%[ptr_tmp]),1 \n\t"
"vleg %%v25 , 152(%[ptr_tmp]),1 \n\t"
"vleg %%v26 , 160(%[ptr_tmp]),0 \n\t"
"vleg %%v27 , 168(%[ptr_tmp]),0 \n\t"
"vleg %%v26 , 176(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 184(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 192(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 200(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 208(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 216(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 224(%[ptr_tmp]),0 \n\t"
"vleg %%v31 , 232(%[ptr_tmp]),0 \n\t"
"vleg %%v30 , 240(%[ptr_tmp]),1 \n\t"
"vleg %%v31 , 248(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
"vfchdb %%v25,%%v1,%%v0 \n\t"
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
"vfchdb %%v27,%%v3,%%v2 \n\t "
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
"vfchdb %%v25,%%v26,%%v24 \n\t"
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
"vfchdb %%v27,%%v30,%%v28 \n\t"
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
"vfchdb %%v24, %%v1,%%v31 \n\t"
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
"vfchdb %%v30, %%v27,%%v3 \n\t"
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vfchdb %%v0, %%v31,%%v28 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
"vag %%v25,%%v25,%%v5 \n\t"
//cmp with previous
"vfchdb %%v30, %%v27,%%v6 \n\t"
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
"vfchdb %%v4,%%v16,%%v17 \n\t"
"vfchdb %%v5,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
//xtract index
"vrepg %%v26,%%v6,1 \n\t"
"vrepg %%v5,%%v7,1 \n\t"
"wfcdb %%v26,%%v6 \n\t"
"jne 2f \n\t"
"vsteg %%v6,%[maxf],0 \n\t"
"vmnlg %%v1,%%v5,%%v7 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"j 3 \n\t"
"2: \n\t"
"wfchdb %%v16,%%v26,%%v6 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"std %%f0,%[maxf] \n\t"
"3: \n\t"
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
"vfchdb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
);
return index;
"vfchdb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"vleg %%v16,128(%%r1,%3),0 \n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v4,%%v16,%%v17 \n\t"
"vfchdb %%v5,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vfchdb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchdb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
return iamax;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i = 0;
@ -223,9 +198,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG n1 = n & -16;
if (n1 > 0) {
max = ziamax_kernel_16_TUNED(n1, x, &maxf);
max = izamax_kernel_16(n1, x, &maxf);
i = n1;
ix = n1 << 1;
}
while(i < n)
@ -260,7 +235,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return (max + 1);
}
}

View File

@ -24,253 +24,217 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#define ABS fabs
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
/**
* Find minimum index
* Warning: requirements n>0 and n % 16 == 0
* @param n
* @param x pointer to the vector
* @param minf (out) minimum absolute value .( only for output )
* @return minimum index
*/
static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index ;
__asm__(
"pfd 1, 0(%[ptr_x]) \n\t"
"vleig %%v16,0,0 \n\t"
"vleig %%v16,1,1 \n\t"
"vleig %%v17,2,0 \n\t"
"vleig %%v17,3,1 \n\t"
"vleig %%v18,4,0 \n\t"
"vleig %%v18,5,1 \n\t"
"vleig %%v19,6,0 \n\t"
"vleig %%v19,7,1 \n\t"
"vleig %%v20,8,0 \n\t"
"vleig %%v20,9,1 \n\t"
"vleig %%v21,10,0 \n\t"
"vleig %%v21,11,1 \n\t"
"vleig %%v22,12,0 \n\t"
"vleig %%v22,13,1 \n\t"
"vleig %%v23,14,0 \n\t"
"vleig %%v23,15,1 \n\t"
"ld %%f6,0(%[ptr_x]) \n\t"
"lpdbr %%f6,%%f6 \n\t"
"ld %%f7,8(%[ptr_x]) \n\t"
"lpdbr %%f7,%%f7 \n\t"
"adbr %%f6,%%f7 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vrepg %%v6,%%v6,0 \n\t"
"vzero %%v7 \n\t"
"vrepig %%v4,16 \n\t"
"vzero %%v5 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
{
BLASLONG iamin;
__asm__ volatile (
"vleg %%v0,0(%3),0 \n\t"
"vleg %%v1,8(%3),0 \n\t"
"vleg %%v0,16(%3),1 \n\t"
"vleg %%v1,24(%3),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v1,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,8 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"srlg %%r0,%2,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vleg %%v16,0(%%r1,%3),0 \n\t"
"vleg %%v17,8(%%r1,%3),0 \n\t"
"vleg %%v16,16(%%r1,%3),1 \n\t"
"vleg %%v17,24(%%r1,%3),1 \n\t"
"vleg %%v18,32(%%r1,%3),0 \n\t"
"vleg %%v19,40(%%r1,%3),0 \n\t"
"vleg %%v18,48(%%r1,%3),1 \n\t"
"vleg %%v19,56(%%r1,%3),1 \n\t"
"vleg %%v20,64(%%r1,%3),0 \n\t"
"vleg %%v21,72(%%r1,%3),0 \n\t"
"vleg %%v20,80(%%r1,%3),1 \n\t"
"vleg %%v21,88(%%r1,%3),1 \n\t"
"vleg %%v22,96(%%r1,%3),0 \n\t"
"vleg %%v23,104(%%r1,%3),0 \n\t"
"vleg %%v22,112(%%r1,%3),1 \n\t"
"vleg %%v23,120(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v24,%%v25 \n\t"
"vfadb %%v1,%%v26,%%v27 \n\t"
"vfadb %%v2,%%v28,%%v29 \n\t"
"vfadb %%v3,%%v30,%%v31 \n\t"
"vleg %%v24 ,128(%[ptr_tmp]),0 \n\t"
"vleg %%v25 ,136(%[ptr_tmp]),0 \n\t"
"vleg %%v24 ,144(%[ptr_tmp]),1 \n\t"
"vleg %%v25 ,152(%[ptr_tmp]),1 \n\t"
"vleg %%v26 ,160(%[ptr_tmp]),0 \n\t"
"vleg %%v27 ,168(%[ptr_tmp]),0 \n\t"
"vleg %%v26 ,176(%[ptr_tmp]),1 \n\t"
"vleg %%v27 ,184(%[ptr_tmp]),1 \n\t"
"vleg %%v28 ,192(%[ptr_tmp]),0 \n\t"
"vleg %%v29 ,200(%[ptr_tmp]),0 \n\t"
"vleg %%v28 ,208(%[ptr_tmp]),1 \n\t"
"vleg %%v29 ,216(%[ptr_tmp]),1 \n\t"
"vleg %%v30 ,224(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,232(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,240(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,248(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
"vfchdb %%v25,%%v0 ,%%v1 \n\t"
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
"vfchdb %%v27,%%v2,%%v3 \n\t"
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
"vfchdb %%v25,%%v24,%%v26 \n\t"
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
"vfchdb %%v27,%%v28,%%v30 \n\t"
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
"vfchdb %%v24,%%v31, %%v1 \n\t"
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
"vfchdb %%v30,%%v3, %%v27 \n\t"
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vfchdb %%v0,%%v28, %%v31 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
"vag %%v25,%%v25,%%v5 \n\t"
//cmp with previous
"vfchdb %%v30,%%v6 , %%v27 \n\t"
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
"vfchdb %%v4,%%v17,%%v16 \n\t"
"vfchdb %%v5,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
//xtract index
"vrepg %%v26,%%v6,1 \n\t"
"vrepg %%v5,%%v7,1 \n\t"
"wfcdb %%v26,%%v6 \n\t"
"jne 2f \n\t"
"vsteg %%v6,%[minf],0 \n\t"
"vmnlg %%v1,%%v5,%%v7 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"j 3f \n\t"
"2: \n\t"
"wfchdb %%v16,%%v6 ,%%v26 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"std %%f0,%[minf] \n\t"
"3: \n\t"
"vfchdb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
"vfchdb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
);
"vleg %%v16,128(%%r1,%3),0 \n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v4,%%v17,%%v16 \n\t"
"vfchdb %%v5,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
return index;
"vfchdb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchdb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v0,%%v2 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"2: \n\t"
"nop "
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
return iamin;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
BLASLONG ix=0;
FLOAT minf;
BLASLONG min=0;
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0;
BLASLONG min = 0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(min);
if (inc_x == 1) {
BLASLONG n1 = n & -16;
if (n1 > 0) {
BLASLONG n1 = n & -16;
if (n1 > 0) {
min = izamin_kernel_16(n1, x, &minf);
min = ziamin_kernel_16_TUNED(n1, x, &minf);
i = n1;
ix = n1 << 1;
}
else {
//assign minf
minf = CABS1(x,0);
ix += 2;
i++;
}
}
while(i < n)
while(i < n)
{
if( CABS1(x,ix) < minf )
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += 2;
i++;
min = i;
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (min + 1);
} else {
inc_x2 = 2 * inc_x;
inc_x2 = 2 * inc_x;
minf = CABS1(x,0);
minf = CABS1(x,0);
ix += inc_x2;
i++;
while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
}
return (min + 1);
}
}

210
kernel/zarch/samax.c Normal file
View File

@ -0,0 +1,210 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vector kernel: largest absolute value among the first n entries of a
 * contiguous single-precision vector.
 *
 * n  element count.  The loop executes n >> 6 times and is closed by a
 *    decrement-and-branch (brctg), so n must be a positive multiple of 64.
 * x  input vector (unit stride)
 *
 * Returns the maximum of |x[i]|.
 *
 * v0 holds the running per-lane maximum; v16..v23 receive the loaded data
 * and v24..v31 are scratch for the pairwise reduction.
 */
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT amax;
__asm__ volatile (
/* Seed the running maximum with |x[0..3]|. */
"vl %%v0,0(%2) \n\t"
"vflpsb %%v0,%%v0 \n\t"
/* r0 = number of 64-element iterations, r1 = byte offset 0. */
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* First half: load 32 floats and take absolute values. */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
/* Pairwise maximum: 8 -> 4 -> 2 -> 1 vectors. */
"vfchsb %%v24,%%v16,%%v17 \n\t"
"vfchsb %%v25,%%v18,%%v19 \n\t"
"vfchsb %%v26,%%v20,%%v21 \n\t"
"vfchsb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchsb %%v28,%%v24,%%v25 \n\t"
"vfchsb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchsb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* Merge the half's maximum into the running maximum. */
"vfchsb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* Second half: same sequence on the next 32 floats. */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfchsb %%v24,%%v16,%%v17 \n\t"
"vfchsb %%v25,%%v18,%%v19 \n\t"
"vfchsb %%v26,%%v20,%%v21 \n\t"
"vfchsb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchsb %%v28,%%v24,%%v25 \n\t"
"vfchsb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchsb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchsb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* Advance 256 bytes (64 floats) and loop. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* Horizontal reduction: fold odd lanes onto even lanes, then lane 2 onto lane 0. */
"veslg %%v16,%%v0,32 \n\t"
"vfchsb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
"vrepf %%v16,%%v0,2 \n\t"
"wfchsb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* Result is in lane 0 of v0; copy it to the output register. */
"ler %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amax;
}
/*
 * Largest absolute value of a strided single-precision vector.
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  distance, in elements, between consecutive entries of x
 *
 * Returns max |x[k]| over the n selected entries, or 0.0 when n <= 0 or
 * inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;   /* running maximum of |x| */
    BLASLONG pos;       /* offset into x */
    BLASLONG seen;      /* elements examined so far (strided path) */
    BLASLONG k;

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        /* Leading multiple of 64 goes to the vector kernel; otherwise the
           first element seeds the running maximum. */
        const BLASLONG bulk = n & -64;

        if (bulk > 0) {
            best = samax_kernel_64(bulk, x);
            pos = bulk;
        } else {
            best = ABS(x[0]);
            pos = 1;
        }

        /* Scalar tail. */
        for (; pos < n; pos++) {
            const FLOAT mag = ABS(x[pos]);
            if (mag > best)
                best = mag;
        }
        return (best);
    }

    /* Strided path: seed with the first element. */
    best = ABS(x[0]);
    pos = inc_x;
    seen = 1;

    {
        /* Groups of four elements while the group start is below the limit. */
        const BLASLONG limit = (n - 1) & -4;

        for (; seen < limit; seen += 4, pos += inc_x * 4) {
            for (k = 0; k < 4; k++) {
                const FLOAT mag = ABS(x[pos + k * inc_x]);
                if (mag > best)
                    best = mag;
            }
        }
    }

    /* Whatever is left, one element at a time. */
    for (; seen < n; seen++, pos += inc_x) {
        const FLOAT mag = ABS(x[pos]);
        if (mag > best)
            best = mag;
    }
    return (best);
}

210
kernel/zarch/samin.c Normal file
View File

@ -0,0 +1,210 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS maps to the libm absolute-value routine matching the build precision:
 * fabs when DOUBLE is defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vectorised min(|x[i]|) over the first n single-precision elements of x.
 *
 * The loop count is n >> 6 and every iteration consumes 256 bytes (64 floats).
 * The loop is bottom-tested (brctg at the end), so the body always runs at
 * least once; the caller in this file only invokes the kernel with
 * n = (n & -64) > 0.
 *
 * Operands: %0 = result (amin), %1 = n, %2 = x.
 * r0 is the iteration counter, r1 the running byte offset into x,
 * v0 holds the running per-lane minimum.
 */
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT amin;
__asm__ volatile (
/* Seed the running minimum with |x[0..3]|. */
"vl %%v0,0(%2) \n\t"
"vflpsb %%v0,%%v0 \n\t"
/* r0 = n / 64 iterations, r1 = 0 byte offset. */
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* First half of the iteration: load bytes 0..127 (32 floats). */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* Absolute value of every lane. */
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
/* Reduction tree 8 -> 4 -> 2 -> 1 vectors: each compare-high builds a lane
 * mask and the following vsel keeps the smaller lane of the pair. */
"vfchsb %%v24,%%v17,%%v16 \n\t"
"vfchsb %%v25,%%v19,%%v18 \n\t"
"vfchsb %%v26,%%v21,%%v20 \n\t"
"vfchsb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchsb %%v28,%%v25,%%v24 \n\t"
"vfchsb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchsb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* Merge the block minimum into the running minimum v0. */
"vfchsb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* Second half of the iteration: same sequence for bytes 128..255. */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfchsb %%v24,%%v17,%%v16 \n\t"
"vfchsb %%v25,%%v19,%%v18 \n\t"
"vfchsb %%v26,%%v21,%%v20 \n\t"
"vfchsb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchsb %%v28,%%v25,%%v24 \n\t"
"vfchsb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchsb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchsb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* Advance 256 bytes and loop while the counter is non-zero. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* Horizontal reduction of v0 into lane 0: the 32-bit shift within each
 * doubleword lines lane 1 up with lane 0 (and lane 3 with lane 2), then
 * lane 2 is broadcast and compared against lane 0 with the scalar compare. */
"veslg %%v16,%%v0,32 \n\t"
"vfchsb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
"vrepf %%v16,%%v0,2 \n\t"
"wfchsb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* Copy the scalar result out of f0. */
"ler %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amin;
}
/*
 * Returns the smallest absolute value among the n elements of x read with
 * stride inc_x. Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading (n & -64) elements go through the vector
 * kernel and the remainder is handled by a scalar loop; for any other stride
 * a scalar loop unrolled by four is used.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT result;
    FLOAT cand;
    BLASLONG pos;
    BLASLONG done;
    BLASLONG unrolled_end;
    BLASLONG u;

    if (n <= 0 || inc_x <= 0) {
        return 0.0;
    }

    if (inc_x == 1) {
        const BLASLONG vec_len = n & -64;

        if (vec_len > 0) {
            /* Vector kernel covers the leading multiple of 64 elements. */
            result = samin_kernel_64(vec_len, x);
            pos = vec_len;
        } else {
            /* Too short for the kernel: seed with the first element. */
            result = ABS(x[0]);
            pos = 1;
        }

        for (; pos < n; pos++) {
            cand = ABS(x[pos]);
            if (cand < result) {
                result = cand;
            }
        }
        return result;
    }

    /* Strided access: seed with element 0, then scan elements 1..n-1. */
    result = ABS(x[0]);
    pos = inc_x;
    done = 1;
    unrolled_end = (n - 1) & -4;

    /* Four elements per pass while a full group is guaranteed to fit. */
    while (done < unrolled_end) {
        for (u = 0; u < 4; u++) {
            cand = ABS(x[pos + u * inc_x]);
            if (cand < result) {
                result = cand;
            }
        }
        pos += inc_x * 4;
        done += 4;
    }

    /* Remaining elements one at a time. */
    for (; done < n; done++) {
        cand = ABS(x[pos]);
        if (cand < result) {
            result = cand;
        }
        pos += inc_x;
    }
    return result;
}

174
kernel/zarch/sasum.c Normal file
View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS maps to the libm absolute-value routine matching the build precision:
 * fabs when DOUBLE is defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vectorised sum of |x[i]| over the first n single-precision elements of x.
 *
 * The loop count is n >> 6 and every iteration consumes 256 bytes (64 floats).
 * The loop is bottom-tested (brctg at the end), so the body always runs at
 * least once; the caller in this file only invokes the kernel with
 * n = (n & -64) > 0.
 *
 * Operands: %0 = result (asum), %1 = n, %2 = x.
 * r0 is the iteration counter, r1 the running byte offset into x,
 * v0..v3 are four independent vector accumulators.
 */
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT asum;
__asm__ (
/* Clear the four accumulators. */
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
/* r0 = n / 64 iterations, r1 = 0 byte offset. */
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* First half of the iteration: load bytes 0..127 (32 floats). */
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
/* Absolute value of every lane. */
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
/* Spread the eight vectors round-robin over the four accumulators. */
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
/* Second half of the iteration: same sequence for bytes 128..255. */
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
/* Advance 256 bytes and loop while the counter is non-zero. */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
/* Combine the four accumulators into v0. */
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
/* Horizontal sum of v0: the 32-bit shift within each doubleword adds lane 1
 * into lane 0 and lane 3 into lane 2, then lane 2 is broadcast to v1 and
 * added to lane 0 with the scalar add. */
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
/* Copy the scalar result out of f0. */
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
}
/*
 * Returns the sum of absolute values of the n elements of x read with stride
 * inc_x. Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading (n & -64) elements go through the vector
 * kernel and the remainder is added in a scalar loop. For any other stride
 * the leading (n & -4) elements are accumulated into two alternating partial
 * sums which are then combined before the remainder is added.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT total = 0.0;
    BLASLONG k;

    if (n <= 0 || inc_x <= 0) {
        return total;
    }

    if (inc_x == 1) {
        const BLASLONG vec_len = n & -64;

        if (vec_len > 0) {
            total = sasum_kernel_64(vec_len, x);
        }
        /* vec_len is zero when the kernel was skipped, so this covers both cases. */
        for (k = vec_len; k < n; k++) {
            total += ABS(x[k]);
        }
    } else {
        const BLASLONG unrolled = n & -4;
        FLOAT lane[2] = { 0.0, 0.0 };
        BLASLONG pos = 0;
        BLASLONG u;

        /* Even offsets feed lane[0], odd offsets feed lane[1]. */
        for (k = 0; k < unrolled; k += 4) {
            for (u = 0; u < 4; u++) {
                lane[u & 1] += ABS(x[pos + u * inc_x]);
            }
            pos += inc_x * 4;
        }
        total = lane[0] + lane[1];

        /* Leftover elements (fewer than four). */
        for (; k < n; k++) {
            total += ABS(x[pos]);
            pos += inc_x;
        }
    }
    return total;
}

184
kernel/zarch/saxpy.c Normal file
View File

@ -0,0 +1,184 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vectorised y[i] += (*alpha) * x[i] for the first n single-precision elements.
 *
 * The loop count is n >> 6 and every iteration handles 256 bytes (64 floats)
 * of both x and y. The loop is bottom-tested (brctg at the end), so the body
 * always runs at least once; the caller in this file only invokes the kernel
 * with a non-zero n = (n & -64).
 *
 * Operands: %0 = n, %1 = x, %2 = y, %3 = *alpha.
 * r0 is the iteration counter, r1 the running byte offset, v0 holds alpha
 * replicated into every lane.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
/* Broadcast alpha; r0 = n / 64 iterations, r1 = 0 byte offset. */
"vlrepf %%v0,%3 \n\t"
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* Bytes 0..63: v16..v19 = x, v20..v23 = y, then v16..v19 = alpha*x + y. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23 \n\t"
/* Bytes 64..127: v24..v27 = x, v28..v31 = y, results reuse v20..v23. */
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,80(%%r1,%1) \n\t"
"vl %%v26,96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v28,64(%%r1,%2) \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vl %%v30,96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"
/* Write the first 128 bytes of results back to y. */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
/* Same load / multiply-add / store sequence for bytes 128..255. */
"vl %%v16,128(%%r1,%1) \n\t"
"vl %%v17,144(%%r1,%1) \n\t"
"vl %%v18,160(%%r1,%1) \n\t"
"vl %%v19,176(%%r1,%1) \n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23 \n\t"
"vl %%v24,192(%%r1,%1) \n\t"
"vl %%v25,208(%%r1,%1) \n\t"
"vl %%v26,224(%%r1,%1) \n\t"
"vl %%v27,240(%%r1,%1) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"
/* Advance 256 bytes and loop while the counter is non-zero. */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Computes y := da * x + y over n elements, reading x with stride inc_x and
 * updating y with stride inc_y. Always returns 0. Does nothing when n <= 0.
 * dummy0, dummy1, dummy and dummy2 are not used.
 *
 * When both strides are 1 the leading (n & -64) elements go through the
 * vector kernel; everything else is handled by scalar loops.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG xi = 0;
    BLASLONG yi = 0;
    BLASLONG unrolled;

    if (n <= 0) {
        return 0;
    }

    if (inc_x == 1 && inc_y == 1) {
        const BLASLONG vec_len = n & -64;

        if (vec_len != 0) {
            saxpy_kernel_64(vec_len, x, y, &da);
        }
        /* Scalar remainder after the vectorised prefix. */
        for (k = vec_len; k < n; k++) {
            y[k] += da * x[k];
        }
        return 0;
    }

    /* General strides, four elements per pass. All four products are formed
     * before any element of y is written. */
    unrolled = n & -4;
    for (k = 0; k < unrolled; k += 4) {
        FLOAT p0 = da * x[xi];
        FLOAT p1 = da * x[xi + inc_x];
        FLOAT p2 = da * x[xi + 2 * inc_x];
        FLOAT p3 = da * x[xi + 3 * inc_x];

        y[yi] += p0;
        y[yi + inc_y] += p1;
        y[yi + 2 * inc_y] += p2;
        y[yi + 3 * inc_y] += p3;

        xi += inc_x * 4;
        yi += inc_y * 4;
    }

    /* Leftover elements (fewer than four). */
    for (; k < n; k++) {
        y[yi] += da * x[xi];
        xi += inc_x;
        yi += inc_y;
    }
    return 0;
}

85
kernel/zarch/scopy.c Normal file
View File

@ -0,0 +1,85 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Copies the first n single-precision elements of x to y in 256-byte chunks.
 *
 * The loop count is n >> 6 and every iteration moves 256 bytes (64 floats).
 * The loop is bottom-tested (brctg at the end), so the body always runs at
 * least once; the caller in this file only invokes the kernel with
 * n = (n & -64) > 0.
 *
 * Operands: %0 = n, %1 = x, %2 = y.
 * r1 and r2 are working copies of the source and destination addresses,
 * r0 is the iteration counter.
 */
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* r1 = source address, r2 = destination address, r0 = n / 64. */
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,6 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
/* Storage-to-storage move of 256 bytes. */
"mvc 0(256,%%r2),0(%%r1) \n\t"
/* Advance both addresses and loop while the counter is non-zero. */
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","r2"
);
}
/*
 * Copies n elements from x (stride inc_x) to y (stride inc_y). Always returns
 * 0. Does nothing when n <= 0.
 *
 * When both strides are 1 the leading (n & -64) elements are moved by the
 * block-copy kernel and the remainder element by element.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG k = 0;

    if (n <= 0) {
        return 0;
    }

    if (inc_x == 1 && inc_y == 1) {
        const BLASLONG vec_len = n & -64;

        if (vec_len > 0) {
            scopy_kernel_64(vec_len, x, y);
            k = vec_len;
        }
        for (; k < n; k++) {
            y[k] = x[k];
        }
    } else {
        BLASLONG src = 0;
        BLASLONG dst = 0;

        for (; k < n; k++) {
            y[dst] = x[src];
            src += inc_x;
            dst += inc_y;
        }
    }
    return 0;
}

140
kernel/zarch/sdot.c Normal file
View File

@ -0,0 +1,140 @@
/***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice,this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice,this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL
DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vectorised dot product of the first n single-precision elements of x and y.
 *
 * The loop count is n >> 5 and every iteration consumes 128 bytes (32 floats)
 * of each input. The loop is bottom-tested (brctg at the end), so the body
 * always runs at least once; the caller in this file only invokes the kernel
 * with a non-zero n = (n & -32).
 *
 * Operands: %0 = result (dot), %1 = n, %2 = x, %3 = y.
 * r0 is the iteration counter, r1 the running byte offset, v0 is the single
 * vector accumulator (all multiply-adds chain through it).
 */
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
FLOAT dot;
__asm__ volatile (
/* Clear the accumulator; r0 = n / 32 iterations, r1 = 0 byte offset. */
"vzero %%v0 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
/* Load 128 bytes of x into v16..v23. */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* Interleave the loads of y with v0 += x * y. */
"vl %%v24,0(%%r1,%3) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"
/* Advance 128 bytes and loop while the counter is non-zero. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* Horizontal sum: broadcast lanes 1..3 and add them to lane 0 as scalars. */
"vrepf %%v1,%%v0,1 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"vrepf %%v3,%%v0,3 \n\t"
"aebr %%f0,%%f1 \n\t"
"aebr %%f0,%%f2 \n\t"
"aebr %%f0,%%f3 \n\t"
/* Copy the scalar result out of f0. */
"ler %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return dot;
}
/*
 * Returns the dot product of n elements of x (stride inc_x) and y (stride
 * inc_y). Returns 0.0 when n <= 0.
 *
 * When both strides are 1 the leading (n & -32) elements go through the
 * vector kernel and the remainder is accumulated in a scalar loop; otherwise
 * a scalar loop unrolled by two is used.
 */
FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
    FLOAT acc = 0.0;
    BLASLONG k;
    BLASLONG xi = 0;
    BLASLONG yi = 0;
    BLASLONG paired;

    if (n <= 0) {
        return acc;
    }

    if (inc_x == 1 && inc_y == 1) {
        const BLASLONG vec_len = n & -32;

        if (vec_len != 0) {
            acc = sdot_kernel_32(vec_len, x, y);
        }
        /* Scalar remainder after the vectorised prefix. */
        for (k = vec_len; k < n; k++) {
            acc += y[k] * x[k];
        }
        return acc;
    }

    /* General strides: two products per pass, then at most one leftover. */
    paired = n & -2;
    for (k = 0; k < paired; k += 2) {
        acc += y[yi] * x[xi] + y[yi + inc_y] * x[xi + inc_x];
        xi += inc_x * 2;
        yi += inc_y * 2;
    }
    for (; k < n; k++) {
        acc += y[yi] * x[xi];
        xi += inc_x;
        yi += inc_y;
    }
    return acc;
}

668
kernel/zarch/sgemv_n_4.c Normal file
View File

@ -0,0 +1,668 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/* Largest block length (in elements along m) handed to the kernels in one
 * pass of the blocking loop in CNAME. */
#define NBMAX 2048
/*
 * Four-column update: for each processed i,
 *   y[i] += ap[0][i]*(xo[0]*alpha) + ap[1][i]*(xo[1]*alpha)
 *         + ap[2][i]*(xo[2]*alpha) + ap[3][i]*(xo[3]*alpha).
 *
 * The main loop handles (n & -32) elements, 32 per iteration; the tail loop
 * handles (n & 28) elements, 4 per iteration. Any remaining (n & 3) elements
 * are not touched, so n is expected to be a multiple of 4.
 *
 * Operands: %0 = n, %1..%4 = ap[0]..ap[3], %5 = xo, %6 = y, %7 = *alpha.
 * r0 is the loop counter, r1 the running byte offset shared by all arrays,
 * v0..v3 hold xo[0..3]*alpha replicated into every lane, v4 is the y working
 * register.
 */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* Broadcast xo[0..3] and alpha, then pre-scale the x values by alpha. */
"vlrepf %%v0,0(%5) \n\t"
"vlrepf %%v1,4(%5) \n\t"
"vlrepf %%v2,8(%5) \n\t"
"vlrepf %%v3,12(%5) \n\t"
"vlrepf %%v4,%7 \n\t"
"vfmsb %%v0,%%v0,%%v4 \n\t"
"vfmsb %%v1,%%v1,%%v4 \n\t"
"vfmsb %%v2,%%v2,%%v4 \n\t"
"vfmsb %%v3,%%v3,%%v4 \n\t"
/* r1 = 0; r0 = n & -32; skip the main loop when that is zero,
 * otherwise r0 = number of 32-element iterations. */
"xgr %%r1,%%r1 \n\t"
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 2,1024(%%r1,%6) \n\t"
/* Bytes 0..63 of each of the four columns. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v20,16(%%r1,%1) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,16(%%r1,%3) \n\t"
"vl %%v23,16(%%r1,%4) \n\t"
"vl %%v24,32(%%r1,%1) \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vl %%v28,48(%%r1,%1) \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vl %%v31,48(%%r1,%4) \n\t"
/* For each 16-byte slice of y: load, add the four column contributions,
 * store back. */
"vl %%v4,0(%%r1,%6) \n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"
"vl %%v4,16(%%r1,%6) \n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"
"vl %%v4,32(%%r1,%6) \n\t"
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,32(%%r1,%6) \n\t"
"vl %%v4,48(%%r1,%6) \n\t"
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,48(%%r1,%6) \n\t"
/* Same sequence for bytes 64..127. */
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,64(%%r1,%2) \n\t"
"vl %%v18,64(%%r1,%3) \n\t"
"vl %%v19,64(%%r1,%4) \n\t"
"vl %%v20,80(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,80(%%r1,%3) \n\t"
"vl %%v23,80(%%r1,%4) \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
"vl %%v4,64(%%r1,%6) \n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,64(%%r1,%6) \n\t"
"vl %%v4,80(%%r1,%6) \n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,80(%%r1,%6) \n\t"
"vl %%v4,96(%%r1,%6) \n\t"
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,96(%%r1,%6) \n\t"
"vl %%v4,112(%%r1,%6) \n\t"
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,112(%%r1,%6) \n\t"
/* Advance 128 bytes and loop while the counter is non-zero. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
/* Tail: r0 = n & 28; skip when zero, otherwise r0 = number of
 * 4-element (16-byte) iterations. r1 continues from the main loop. */
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v4,0(%%r1,%6) \n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Two-column update: for each processed i,
 *   y[i] += ap[0][i]*(xo[0]*alpha) + ap[1][i]*(xo[1]*alpha).
 *
 * The main loop handles (n & -32) elements, 32 per iteration; the tail loop
 * handles (n & 28) elements, 4 per iteration. Any remaining (n & 3) elements
 * are not touched, so n is expected to be a multiple of 4.
 *
 * Operands: %0 = n, %1 = ap[0], %2 = ap[1], %3 = xo, %4 = y, %5 = *alpha.
 * r0 is the loop counter, r1 the running byte offset, v0/v1 hold
 * xo[0]*alpha and xo[1]*alpha replicated into every lane, v2 is the y
 * working register.
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* Broadcast xo[0..1] and alpha, then pre-scale the x values by alpha. */
"vlrepf %%v0,0(%3) \n\t"
"vlrepf %%v1,4(%3) \n\t"
"vlrepf %%v2,%5 \n\t"
"vfmsb %%v0,%%v0,%%v2 \n\t"
"vfmsb %%v1,%%v1,%%v2 \n\t"
/* r1 = 0; r0 = n & -32; skip the main loop when that is zero,
 * otherwise r0 = number of 32-element iterations. */
"xgr %%r1,%%r1 \n\t"
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%4) \n\t"
/* Load 128 bytes from each column: even registers from ap[0],
 * odd registers from ap[1]. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"vl %%v20,32(%%r1,%1) \n\t"
"vl %%v21,32(%%r1,%2) \n\t"
"vl %%v22,48(%%r1,%1) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
/* For each 16-byte slice of y: load, add both column contributions,
 * store back. */
"vl %%v2,0(%%r1,%4) \n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"vl %%v2,16(%%r1,%4) \n\t"
"vfmasb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"
"vl %%v2,32(%%r1,%4) \n\t"
"vfmasb %%v2,%%v20,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v21,%%v1,%%v2 \n\t"
"vst %%v2,32(%%r1,%4) \n\t"
"vl %%v2,48(%%r1,%4) \n\t"
"vfmasb %%v2,%%v22,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v23,%%v1,%%v2 \n\t"
"vst %%v2,48(%%r1,%4) \n\t"
"vl %%v2,64(%%r1,%4) \n\t"
"vfmasb %%v2,%%v24,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v25,%%v1,%%v2 \n\t"
"vst %%v2,64(%%r1,%4) \n\t"
"vl %%v2,80(%%r1,%4) \n\t"
"vfmasb %%v2,%%v26,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v27,%%v1,%%v2 \n\t"
"vst %%v2,80(%%r1,%4) \n\t"
"vl %%v2,96(%%r1,%4) \n\t"
"vfmasb %%v2,%%v28,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v29,%%v1,%%v2 \n\t"
"vst %%v2,96(%%r1,%4) \n\t"
"vl %%v2,112(%%r1,%4) \n\t"
"vfmasb %%v2,%%v30,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v31,%%v1,%%v2 \n\t"
"vst %%v2,112(%%r1,%4) \n\t"
/* Advance 128 bytes and loop while the counter is non-zero. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
/* Tail: r0 = n & 28; skip when zero, otherwise r0 = number of
 * 4-element (16-byte) iterations. r1 continues from the main loop. */
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v2,0(%%r1,%4) \n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Single-column update: for each processed i, y[i] += a0[i]*(xo[0]*alpha).
 *
 * The main loop handles (n & -32) elements, 32 per iteration; the tail loop
 * handles (n & 28) elements, 4 per iteration. Any remaining (n & 3) elements
 * are not touched, so n is expected to be a multiple of 4.
 *
 * Operands: %0 = n, %1 = a0, %2 = xo, %3 = y, %4 = *alpha.
 * r0 is the loop counter, r1 the running byte offset, v0 holds xo[0]*alpha
 * replicated into every lane, v1 is the y working register.
 * The clobber list also names v24..v31 although the code does not use them.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* Broadcast xo[0] and alpha, then pre-scale by alpha. */
"vlrepf %%v0,0(%2) \n\t"
"vlrepf %%v1,%4 \n\t"
"vfmsb %%v0,%%v0,%%v1 \n\t"
/* r1 = 0; r0 = n & -32; skip the main loop when that is zero,
 * otherwise r0 = number of 32-element iterations. */
"xgr %%r1,%%r1 \n\t"
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
/* Load 128 bytes of the column. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%1) \n\t"
"vl %%v22,96(%%r1,%1) \n\t"
"vl %%v23,112(%%r1,%1) \n\t"
/* For each 16-byte slice of y: load, multiply-add, store back. */
"vl %%v1,0(%%r1,%3) \n\t"
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"vl %%v1,16(%%r1,%3) \n\t"
"vfmasb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"
"vl %%v1,32(%%r1,%3) \n\t"
"vfmasb %%v1,%%v18,%%v0,%%v1 \n\t"
"vst %%v1,32(%%r1,%3) \n\t"
"vl %%v1,48(%%r1,%3) \n\t"
"vfmasb %%v1,%%v19,%%v0,%%v1 \n\t"
"vst %%v1,48(%%r1,%3) \n\t"
"vl %%v1,64(%%r1,%3) \n\t"
"vfmasb %%v1,%%v20,%%v0,%%v1 \n\t"
"vst %%v1,64(%%r1,%3) \n\t"
"vl %%v1,80(%%r1,%3) \n\t"
"vfmasb %%v1,%%v21,%%v0,%%v1 \n\t"
"vst %%v1,80(%%r1,%3) \n\t"
"vl %%v1,96(%%r1,%3) \n\t"
"vfmasb %%v1,%%v22,%%v0,%%v1 \n\t"
"vst %%v1,96(%%r1,%3) \n\t"
"vl %%v1,112(%%r1,%3) \n\t"
"vfmasb %%v1,%%v23,%%v0,%%v1 \n\t"
"vst %%v1,112(%%r1,%3) \n\t"
/* Advance 128 bytes and loop while the counter is non-zero. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
/* Tail: r0 = n & 28; skip when zero, otherwise r0 = number of
 * 4-element (16-byte) iterations. r1 continues from the main loop. */
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v1,0(%%r1,%3) \n\t"
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Adds the n contiguous values in src onto dest, where consecutive
 * destination elements are inc_dest apart: dest[k * inc_dest] += src[k].
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG k;
    BLASLONG out = 0;

    for (k = 0; k < n; k++) {
        dest[out] += src[k];
        out += inc_dest;
    }
}
/*
 * Non-transposed matrix-vector update: y := alpha * A * x + y.
 *
 * A is addressed column by column: column j starts at a + j * lda and holds
 * m elements. x is read with stride inc_x (n elements), y is updated with
 * stride inc_y (m elements). buffer is scratch space used to accumulate a
 * block of results when inc_y != 1; it must hold at least NBMAX elements.
 * dummy1 is not used. Always returns 0; does nothing when m < 1 or n < 1.
 *
 * The m & -4 leading elements of y are processed in blocks of at most NBMAX
 * by the vector kernels (4, 2 or 1 columns of A at a time); the final m & 3
 * elements are computed by the scalar code at the end.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    FLOAT *ap[4];
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    BLASLONG lda4 = lda << 2;
    FLOAT xbuffer[8],*ybuffer;

    if ( m < 1 ) return(0);
    if ( n < 1 ) return(0);

    ybuffer = buffer;

    n1 = n >> 2 ;                       /* groups of four columns */
    n2 = n & 3 ;                        /* leftover columns */

    m3 = m & 3 ;                        /* elements left for the scalar tail */
    m1 = m & -4 ;                       /* elements handled by the kernels */
    m2 = (m & (NBMAX-1)) - m3 ;         /* size of the final, partial block */

    y_ptr = y;

    BLASLONG NB = NBMAX;

    /* Each pass handles one block of NB elements of y. The loop ends after the
     * partial block (NB != NBMAX) or when nothing is left (break). */
    while ( NB == NBMAX )
    {
        m1 -= NB;
        if ( m1 < 0)
        {
            if ( m2 == 0 ) break;
            NB = m2;
        }

        a_ptr = a;
        x_ptr = x;

        ap[0] = a_ptr;
        ap[1] = a_ptr + lda;
        ap[2] = ap[1] + lda;
        ap[3] = ap[2] + lda;

        /* With a strided y the kernels accumulate into the zeroed scratch
         * buffer, which add_y scatters afterwards; otherwise they update y in
         * place. The size is NB elements of FLOAT (the previous hard-coded
         * NB*8 assumed 8-byte elements and cleared twice what is used). */
        if ( inc_y != 1 )
            memset(ybuffer,0,NB*sizeof(FLOAT));
        else
            ybuffer = y_ptr;

        if ( inc_x == 1 )
        {
            for( i = 0; i < n1 ; i++)
            {
                sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
                x_ptr += 4;
            }

            if ( n2 & 2 )
            {
                sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
                a_ptr += lda*2;
                x_ptr += 2;
            }

            if ( n2 & 1 )
            {
                sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
                a_ptr += lda;
                x_ptr += 1;
            }
        }
        else
        {
            /* Strided x: gather the values into xbuffer so the kernels can
             * read them contiguously. */
            for( i = 0; i < n1 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[1] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[2] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[3] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
            }

            for( i = 0; i < n2 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
                a_ptr += lda;
            }
        }

        /* Move on to the next block of A and y. */
        a += NB;
        if ( inc_y != 1 )
        {
            add_y(NB,ybuffer,y_ptr,inc_y);
            y_ptr += NB * inc_y;
        }
        else
            y_ptr += NB ;
    }

    if ( m3 == 0 ) return(0);

    /* Scalar tail for the last m3 (1..3) elements of y. */
    if ( m3 == 3 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        FLOAT temp2 = 0.0;
        if ( lda == 3 && inc_x ==1 )
        {
            /* Columns are packed back to back: unroll over four columns. */
            for( i = 0; i < ( n & -4 ); i+=4 )
            {
                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
                temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

                temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
                temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
                temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

                a_ptr += 12;
                x_ptr += 4;
            }
            for( ; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += 3;
                x_ptr ++;
            }
        }
        else
        {
            for( i = 0; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp2;
        return(0);
    }

    if ( m3 == 2 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        if ( lda == 2 && inc_x ==1 )
        {
            /* Columns are packed back to back: unroll over four columns. */
            for( i = 0; i < (n & -4) ; i+=4 )
            {
                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
                temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
                a_ptr += 8;
                x_ptr += 4;
            }
            for( ; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += 2;
                x_ptr ++;
            }
        }
        else
        {
            for( i = 0; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        return(0);
    }

    if ( m3 == 1 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp = 0.0;
        if ( lda == 1 && inc_x ==1 )
        {
            /* A degenerates to a contiguous row: plain dot product. */
            for( i = 0; i < (n & -4); i+=4 )
            {
                temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
            }
            for( ; i < n; i++ )
            {
                temp += a_ptr[i] * x_ptr[i];
            }
        }
        else
        {
            for( i = 0; i < n; i++ )
            {
                temp += a_ptr[0] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }
        y_ptr[0] += alpha * temp;
        return(0);
    }

    return(0);
}

826
kernel/zarch/sgemv_t_4.c Normal file
View File

@ -0,0 +1,826 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 2048
/*
 * Four-column dot-product kernel for the transposed GEMV.
 *
 * For k = 0..3 stores y[k] = sum(ap[k][i] * x[i]); y is overwritten, not
 * accumulated into.  The unrolled loop consumes 32 elements per pass
 * (count = (n & -32) >> 5) and the remainder loop 4 elements per pass
 * (count = (n & 28) >> 2), so the trailing n & 3 elements are never read.
 *
 *   n  - number of elements in each column and in x
 *   ap - ap[0..3] point at the four columns
 *   x  - input vector (unit stride)
 *   y  - receives four results at byte offsets 0, 4, 8 and 12
 *
 * NOTE(review): the single-precision instructions and 4-byte result offsets
 * assume FLOAT is a 4-byte float; FLOAT is defined outside this file.
 *
 * Label 3 must lead directly into the horizontal reduction.  r0 is always
 * zero when control reaches it, so a "brctg %r0,0b" placed there would wrap
 * the counter to -1 and re-enter the unrolled loop, running far past the end
 * of the inputs.
 */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* v0..v3: one 4-lane accumulator per column; r1: running byte offset */
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "vzero %%v2 \n\t"
        "vzero %%v3 \n\t"
        "xgr %%r1,%%r1 \n\t"
        /* r0 = (n & -32) >> 5: passes of the 32-element unrolled loop */
        "lghi %%r0,-32 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"
        "srlg %%r0,%%r0,5 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 1,1024(%%r1,%3) \n\t"
        "pfd 1,1024(%%r1,%4) \n\t"
        "pfd 1,1024(%%r1,%5) \n\t"
        /* v16..v23 = 32 elements of x */
        "vl %%v16,0(%%r1,%5) \n\t"
        "vl %%v17,16(%%r1,%5) \n\t"
        "vl %%v18,32(%%r1,%5) \n\t"
        "vl %%v19,48(%%r1,%5) \n\t"
        "vl %%v20,64(%%r1,%5) \n\t"
        "vl %%v21,80(%%r1,%5) \n\t"
        "vl %%v22,96(%%r1,%5) \n\t"
        "vl %%v23,112(%%r1,%5) \n\t"
        /* for each x vector: load the matching slice of every column and
           fuse-multiply-add it into that column's accumulator */
        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
        "vl %%v26,0(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t"
        "vl %%v27,0(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t"
        "vl %%v28,16(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t"
        "vl %%v29,16(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t"
        "vl %%v30,16(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t"
        "vl %%v31,16(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t"
        "vl %%v24,32(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t"
        "vl %%v25,32(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t"
        "vl %%v26,32(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t"
        "vl %%v27,32(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t"
        "vl %%v28,48(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t"
        "vl %%v29,48(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t"
        "vl %%v30,48(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t"
        "vl %%v31,48(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t"
        "vl %%v24,64(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t"
        "vl %%v25,64(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t"
        "vl %%v26,64(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t"
        "vl %%v27,64(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t"
        "vl %%v28,80(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t"
        "vl %%v29,80(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t"
        "vl %%v30,80(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t"
        "vl %%v31,80(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t"
        "vl %%v24,96(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t"
        "vl %%v25,96(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t"
        "vl %%v26,96(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t"
        "vl %%v27,96(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t"
        "vl %%v28,112(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t"
        "vl %%v29,112(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t"
        "vl %%v30,112(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t"
        "vl %%v31,112(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t"
        /* advance 128 bytes and repeat while the pass counter is non-zero */
        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"
        "1: \n\t"
        /* r0 = (n & 28) >> 2: passes of the 4-element remainder loop */
        "lghi %%r0,28 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"
        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%5) \n\t"
        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
        "vl %%v26,0(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t"
        "vl %%v27,0(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t"
        "agfi %%r1,16 \n\t"
        "brctg %%r0,2b \n\t"
        "3: \n\t"
        /* horizontal reduction: fold lanes 1..3 of each accumulator into
           lane 0, then store the scalar to y[k] */
        "vrepf %%v4,%%v0,1 \n\t"
        "aebr %%f0,%%f4 \n\t"
        "vrepf %%v4,%%v0,2 \n\t"
        "aebr %%f0,%%f4 \n\t"
        "vrepf %%v4,%%v0,3 \n\t"
        "aebr %%f0,%%f4 \n\t"
        "ste %%f0,0(%6) \n\t"
        "vrepf %%v4,%%v1,1 \n\t"
        "aebr %%f1,%%f4 \n\t"
        "vrepf %%v4,%%v1,2 \n\t"
        "aebr %%f1,%%f4 \n\t"
        "vrepf %%v4,%%v1,3 \n\t"
        "aebr %%f1,%%f4 \n\t"
        "ste %%f1,4(%6) \n\t"
        "vrepf %%v4,%%v2,1 \n\t"
        "aebr %%f2,%%f4 \n\t"
        "vrepf %%v4,%%v2,2 \n\t"
        "aebr %%f2,%%f4 \n\t"
        "vrepf %%v4,%%v2,3 \n\t"
        "aebr %%f2,%%f4 \n\t"
        "ste %%f2,8(%6) \n\t"
        "vrepf %%v4,%%v3,1 \n\t"
        "aebr %%f3,%%f4 \n\t"
        "vrepf %%v4,%%v3,2 \n\t"
        "aebr %%f3,%%f4 \n\t"
        "vrepf %%v4,%%v3,3 \n\t"
        "aebr %%f3,%%f4 \n\t"
        "ste %%f3,12(%6) "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
/*
 * Two-column dot-product kernel for the transposed GEMV.
 *
 * Stores y[0] = sum(ap[0][i] * x[i]) and y[1] = sum(ap[1][i] * x[i]); y is
 * overwritten, not accumulated into.  The unrolled loop consumes 32 elements
 * per pass (count = (n & -32) >> 5) and the remainder loop 4 elements per pass
 * (count = (n & 28) >> 2), so the trailing n & 3 elements are never read.
 *
 *   n  - number of elements in each column and in x
 *   ap - only ap[0] and ap[1] are used
 *   x  - input vector (unit stride)
 *   y  - receives two results at byte offsets 0 and 4
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* v0/v1: 4-lane accumulators for the two columns; r1: running byte offset */
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"xgr %%r1,%%r1 \n\t"
/* r0 = (n & -32) >> 5; skip the unrolled loop when it is zero */
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
/* v16..v23 = 32 elements of x */
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
/* alternate column 0 / column 1 loads, each fused-multiply-added with the
   matching x vector into v0 / v1 */
"vl %%v24,0(%%r1,%1) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,16(%%r1,%1) \n\t"
"vfmasb %%v0,%%v17,%%v26,%%v0 \n\t"
"vl %%v27,16(%%r1,%2) \n\t"
"vfmasb %%v1,%%v17,%%v27,%%v1 \n\t"
"vl %%v28,32(%%r1,%1) \n\t"
"vfmasb %%v0,%%v18,%%v28,%%v0 \n\t"
"vl %%v29,32(%%r1,%2) \n\t"
"vfmasb %%v1,%%v18,%%v29,%%v1 \n\t"
"vl %%v30,48(%%r1,%1) \n\t"
"vfmasb %%v0,%%v19,%%v30,%%v0 \n\t"
"vl %%v31,48(%%r1,%2) \n\t"
"vfmasb %%v1,%%v19,%%v31,%%v1 \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vfmasb %%v0,%%v20,%%v24,%%v0 \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vfmasb %%v1,%%v20,%%v25,%%v1 \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vfmasb %%v0,%%v21,%%v26,%%v0 \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vfmasb %%v1,%%v21,%%v27,%%v1 \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vfmasb %%v0,%%v22,%%v28,%%v0 \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vfmasb %%v1,%%v22,%%v29,%%v1 \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vfmasb %%v0,%%v23,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vfmasb %%v1,%%v23,%%v31,%%v1 \n\t"
/* advance 128 bytes and repeat while the pass counter is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
/* r0 = (n & 28) >> 2: passes of the 4-element remainder loop */
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
/* horizontal reduction: fold lanes 1..3 of each accumulator into lane 0,
   then store the scalar to y[0] / y[1] */
"vrepf %%v2,%%v0,1 \n\t"
"aebr %%f0,%%f2 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"aebr %%f0,%%f2 \n\t"
"vrepf %%v2,%%v0,3 \n\t"
"aebr %%f0,%%f2 \n\t"
"ste %%f0,0(%4) \n\t"
"vrepf %%v2,%%v1,1 \n\t"
"aebr %%f1,%%f2 \n\t"
"vrepf %%v2,%%v1,2 \n\t"
"aebr %%f1,%%f2 \n\t"
"vrepf %%v2,%%v1,3 \n\t"
"aebr %%f1,%%f2 \n\t"
"ste %%f1,4(%4) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Single-column dot-product kernel for the transposed GEMV.
 *
 * Stores y[0] = sum(a0[i] * x[i]); y is overwritten, not accumulated into.
 * The unrolled loop consumes 32 elements per pass (count = (n & -32) >> 5)
 * and the remainder loop 4 elements per pass (count = (n & 28) >> 2), so the
 * trailing n & 3 elements are never read.
 *
 *   n  - number of elements in the column and in x
 *   a0 - the column
 *   x  - input vector (unit stride)
 *   y  - receives one result at byte offset 0
 *
 * The unrolled loop must end with "agfi %r1,128" / "brctg %r0,0b".  Without
 * that pair the body runs only once, the byte offset never advances, and the
 * remainder loop re-reads the first elements, giving wrong sums for n >= 32.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* v0: 4-lane accumulator; r1: running byte offset */
        "vzero %%v0 \n\t"
        "xgr %%r1,%%r1 \n\t"
        /* r0 = (n & -32) >> 5; skip the unrolled loop when it is zero */
        "lghi %%r0,-32 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"
        "srlg %%r0,%%r0,5 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        /* v16..v23 = 32 elements of x */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        /* load the matching column slices and accumulate x * a0 into v0 */
        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,16(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"
        "vl %%v26,32(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"
        "vl %%v27,48(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"
        "vl %%v28,64(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"
        "vl %%v29,80(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"
        "vl %%v30,96(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"
        "vl %%v31,112(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"
        /* advance 128 bytes and repeat while the pass counter is non-zero */
        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"
        "1: \n\t"
        /* r0 = (n & 28) >> 2: passes of the 4-element remainder loop */
        "lghi %%r0,28 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"
        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "agfi %%r1,16 \n\t"
        "brctg %%r0,2b \n\t"
        "3: \n\t"
        /* horizontal reduction: fold lanes 1..3 into lane 0, store to y[0] */
        "vrepf %%v1,%%v0,1 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "vrepf %%v1,%%v0,2 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "vrepf %%v1,%%v0,3 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "ste %%f0,0(%3) "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y)
        :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
/*
 * Gathers n elements from a strided source into a contiguous destination:
 * dest[k] = src[k * inc_src] for k = 0..n-1.  Does nothing when n <= 0.
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    BLASLONG remaining;

    for (remaining = n; remaining > 0; remaining--)
    {
        *dest++ = *src;
        src += inc_src;
    }
}
/*
 * Unit-stride scaled accumulate: dest[i] += src[i] * da.
 *
 * The unrolled loop handles 32 elements per pass (count = (n & -32) >> 5) and
 * the remainder loop 4 elements per pass (count = (n & 28) >> 2); the trailing
 * n & 3 elements are not touched, so callers are expected to pass a multiple
 * of 4.
 *
 *   n    - number of elements
 *   da   - scale factor, read from memory and replicated into every lane of v0
 *   src  - values to add (read only)
 *   dest - vector updated in place
 */
static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{
__asm__ volatile (
/* v0 = da in all lanes; r1 = running byte offset */
"vlrepf %%v0,%1 \n\t"
"xgr %%r1,%%r1 \n\t"
/* r0 = (n & -32) >> 5; skip the unrolled loop when it is zero */
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
/* v16..v23 = 32 elements of src */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* per vector: load dest, dest = src * da + dest, store back */
"vl %%v24, 0(%%r1,%3) \n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"
"vl %%v25, 16(%%r1,%3) \n\t"
"vfmasb %%v25,%%v17,%%v0,%%v25 \n\t"
"vst %%v25, 16(%%r1,%3) \n\t"
"vl %%v26, 32(%%r1,%3) \n\t"
"vfmasb %%v26,%%v18,%%v0,%%v26 \n\t"
"vst %%v26, 32(%%r1,%3) \n\t"
"vl %%v27, 48(%%r1,%3) \n\t"
"vfmasb %%v27,%%v19,%%v0,%%v27 \n\t"
"vst %%v27, 48(%%r1,%3) \n\t"
"vl %%v28, 64(%%r1,%3) \n\t"
"vfmasb %%v28,%%v20,%%v0,%%v28 \n\t"
"vst %%v28, 64(%%r1,%3) \n\t"
"vl %%v29, 80(%%r1,%3) \n\t"
"vfmasb %%v29,%%v21,%%v0,%%v29 \n\t"
"vst %%v29, 80(%%r1,%3) \n\t"
"vl %%v30, 96(%%r1,%3) \n\t"
"vfmasb %%v30,%%v22,%%v0,%%v30 \n\t"
"vst %%v30, 96(%%r1,%3) \n\t"
"vl %%v31, 112(%%r1,%3) \n\t"
"vfmasb %%v31,%%v23,%%v0,%%v31 \n\t"
"vst %%v31, 112(%%r1,%3) \n\t"
/* advance 128 bytes and repeat while the pass counter is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
/* r0 = (n & 28) >> 2: passes of the 4-element remainder loop */
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%3) \n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
/* the nop only gives label 3 an instruction to land on */
"nop "
:
:"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Scaled accumulate into a possibly strided destination:
 * dest[k * inc_dest] += src[k] * da for k = 0..n-1.
 * A unit-stride destination is delegated to the vector kernel; any other
 * stride is handled by the scalar loop.
 */
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG k;

    if (inc_dest == 1)
    {
        add_y_kernel_4(n, da, src, dest);
        return;
    }

    for (k = 0; k < n; k++)
    {
        *dest += src[k] * da;
        dest += inc_dest;
    }
}
/*
 * Transposed matrix-vector update: for every column j of the m-by-n matrix a
 * (columns lda elements apart), y[j * inc_y] += alpha * sum_i a[i + j*lda] * x[i * inc_x].
 *
 *   m, n    - rows and columns of a; returns at once when either is < 1
 *   dummy1  - unused
 *   alpha   - scale applied to every dot product
 *   a, lda  - matrix and its column stride
 *   x, inc_x - input vector of m elements and its stride
 *   y, inc_y - vector of n elements updated in place, and its stride
 *   buffer  - scratch: the start holds a contiguous copy of x when
 *             inc_x != 1, and ytemp (partial dot products) follows after
 *             min(m, NBMAX) elements
 *
 * Rows are processed in blocks of at most NBMAX by the vector kernels; the
 * final m & 3 rows are handled by the scalar code after the block loop.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG register i;
BLASLONG register j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n0;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
/* receives the 1 or 2 results of the 4x1 / 4x2 kernels */
FLOAT ybuffer[2] __attribute__ ((aligned(16)));
FLOAT *xbuffer;
FLOAT *ytemp;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
xbuffer = buffer;
ytemp = buffer + (m < NBMAX ? m : NBMAX);
/* column split: n0 chunks of NBMAX columns, then n1 groups of 4, then n2 (0..3) single columns */
n0 = n / NBMAX;
n1 = (n % NBMAX) >> 2 ;
n2 = n & 3 ;
/* row split: m3 leftover rows (0..3), m1 rows in whole groups of 4,
   m2 rows of the final partial block (m mod NBMAX without the leftover) */
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
BLASLONG NB = NBMAX;
/* one pass per row block; the pass that shrinks NB to m2 is the last one */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
/* the kernels need unit-stride x: use it directly or gather it into the scratch buffer */
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(NB,x_ptr,xbuffer,inc_x);
FLOAT *ap[4];
FLOAT *yp;
BLASLONG register lda4 = 4 * lda;
/* ap[0..3] track four consecutive columns of the current row block */
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( n0 > 0 )
{
BLASLONG nb1 = NBMAX / 4;
for( j=0; j<n0; j++)
{
/* fill ytemp with NBMAX dot products, four columns per kernel call */
yp = ytemp;
for( i = 0; i < nb1 ; i++)
{
sgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
/* y += alpha * ytemp for this chunk of columns */
add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += nb1 * inc_y * 4;
a_ptr += nb1 * lda4 ;
}
}
/* remaining groups of four columns */
yp = ytemp;
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
if ( n1 > 0 )
{
add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += n1 * inc_y * 4;
a_ptr += n1 * lda4 ;
}
/* leftover columns: a pair via ap[0..1], then a single one via a_ptr */
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer);
a_ptr += lda * 2;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
*y_ptr += ybuffer[1] * alpha;
y_ptr += inc_y;
}
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += lda;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
}
/* move on to the next row block */
a += NB;
x += NB * inc_x;
}
/* scalar tail: a and x now point at the last m3 rows */
if ( m3 == 0 ) return(0);
x_ptr = x;
a_ptr = a;
if ( m3 == 3 )
{
/* alpha is folded into the three x values once */
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
/* densely packed columns (lda == 3) and unit-stride y: walk a linearly, 4 columns per pass */
if ( lda == 3 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
}
else
{
/* general lda with unit-stride y: same 4-column unroll using column offsets */
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
aj += lda;
}
}
else
{
/* strided y: one column per pass */
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
if ( m3 == 2 )
{
/* same three variants as above, for two leftover rows */
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 2 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
aj += 8;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
aj += 2;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
/* m3 == 1: a single leftover row, again in the same three variants */
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 1 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[j] * xtemp;
y_ptr[j+1] += aj[j+1] * xtemp;
y_ptr[j+2] += aj[j+2] * xtemp;
y_ptr[j+3] += aj[j+3] * xtemp;
}
for ( ; j<n ; j++ )
{
y_ptr[j] += aj[j] * xtemp;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp;
y_ptr[j+1] += *(aj+lda) * xtemp;
y_ptr[j+2] += *(aj+lda2) * xtemp;
y_ptr[j+3] += *(aj+lda3) * xtemp;
aj += lda4 ;
}
for ( ; j<n; j++ )
{
y_ptr[j] += *aj * xtemp;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}

186
kernel/zarch/smax.c Normal file
View File

@ -0,0 +1,186 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Returns the largest of the first n elements of x (unit stride).
 *
 * Each loop pass consumes 64 elements (256 bytes) and the pass count is
 * n >> 6.  The body runs before the count is tested, so n must be a non-zero
 * multiple of 64; the caller in this file passes n & -64 only when it is > 0.
 */
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT max;
__asm__ volatile (
/* v0 = running per-lane maximum, seeded with the first four elements */
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* first 32 elements of this pass */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* pairwise tree: vfchsb builds an "a > b" lane mask, vsel then keeps a
   where the mask is set and b elsewhere */
"vfchsb %%v24,%%v16,%%v17 \n\t"
"vfchsb %%v25,%%v18,%%v19 \n\t"
"vfchsb %%v26,%%v20,%%v21 \n\t"
"vfchsb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchsb %%v28,%%v24,%%v25 \n\t"
"vfchsb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchsb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* fold the block maximum into the running maximum */
"vfchsb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* second 32 elements of this pass, same reduction */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchsb %%v24,%%v16,%%v17 \n\t"
"vfchsb %%v25,%%v18,%%v19 \n\t"
"vfchsb %%v26,%%v20,%%v21 \n\t"
"vfchsb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchsb %%v28,%%v24,%%v25 \n\t"
"vfchsb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchsb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchsb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* lane reduction: shifting each doubleword left by 32 bits moves lanes 1
   and 3 onto lanes 0 and 2, so lane 0 = max(l0,l1) and lane 2 = max(l2,l3) */
"veslg %%v16,%%v0,32 \n\t"
"vfchsb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* broadcast lane 2 and compare it against lane 0 to finish in lane 0 */
"vrepf %%v16,%%v0,2 \n\t"
"wfchsb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
"ler %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return max;
}
/*
 * Returns the maximum of the n elements x[0], x[inc_x], x[2*inc_x], ...
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading n & -64 elements go through the vector kernel and
 * the rest through a scalar scan.  Other strides: scalar scan, examined four
 * elements at a time while at least four remain.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG idx;
    BLASLONG seen;
    BLASLONG lane;
    BLASLONG unrolled_end;

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x == 1) {
        BLASLONG vec_n = n & -64;

        if (vec_n > 0) {
            best = smax_kernel_64(vec_n, x);
            idx = vec_n;
        } else {
            /* fewer than 64 elements: seed with the first one */
            best = x[0];
            idx = 1;
        }

        for (; idx < n; idx++) {
            if (x[idx] > best)
                best = x[idx];
        }
        return best;
    }

    /* strided input: element 0 seeds the result, scanning starts at element 1 */
    best = x[0];
    idx = inc_x;
    seen = 1;
    unrolled_end = (n - 1) & -4;

    for (; seen < unrolled_end; seen += 4, idx += inc_x * 4) {
        for (lane = 0; lane < 4; lane++) {
            if (x[idx + lane * inc_x] > best)
                best = x[idx + lane * inc_x];
        }
    }

    for (; seen < n; seen++, idx += inc_x) {
        if (x[idx] > best)
            best = x[idx];
    }
    return best;
}

186
kernel/zarch/smin.c Normal file
View File

@ -0,0 +1,186 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Returns the smallest of the first n elements of x (unit stride).
 *
 * Each loop pass consumes 64 elements (256 bytes) and the pass count is
 * n >> 6.  The body runs before the count is tested, so n must be a non-zero
 * multiple of 64; the caller in this file passes n & -64 only when it is > 0.
 */
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT min;
__asm__ volatile (
/* v0 = running per-lane minimum, seeded with the first four elements */
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* first 32 elements of this pass */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* pairwise tree: the compare operands are swapped relative to the select, so
   the mask is "b > a" and vsel keeps a exactly where it is the smaller one */
"vfchsb %%v24,%%v17,%%v16 \n\t"
"vfchsb %%v25,%%v19,%%v18 \n\t"
"vfchsb %%v26,%%v21,%%v20 \n\t"
"vfchsb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchsb %%v28,%%v25,%%v24 \n\t"
"vfchsb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchsb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* fold the block minimum into the running minimum */
"vfchsb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* second 32 elements of this pass, same reduction */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchsb %%v24,%%v17,%%v16 \n\t"
"vfchsb %%v25,%%v19,%%v18 \n\t"
"vfchsb %%v26,%%v21,%%v20 \n\t"
"vfchsb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchsb %%v28,%%v25,%%v24 \n\t"
"vfchsb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchsb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchsb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* lane reduction: shifting each doubleword left by 32 bits moves lanes 1
   and 3 onto lanes 0 and 2, so lane 0 = min(l0,l1) and lane 2 = min(l2,l3) */
"veslg %%v16,%%v0,32 \n\t"
"vfchsb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* broadcast lane 2 and compare it against lane 0 to finish in lane 0 */
"vrepf %%v16,%%v0,2 \n\t"
"wfchsb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
"ler %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return min;
}
/*
 * Returns the minimum of the n elements x[0], x[inc_x], x[2*inc_x], ...
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading n & -64 elements go through the vector kernel and
 * the rest through a scalar scan.  Other strides: scalar scan, examined four
 * elements at a time while at least four remain.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT lowest = 0.0;
    BLASLONG idx;
    BLASLONG seen;
    BLASLONG lane;
    BLASLONG unrolled_end;

    if (n <= 0 || inc_x <= 0)
        return lowest;

    if (inc_x == 1) {
        BLASLONG vec_n = n & -64;

        if (vec_n > 0) {
            lowest = smin_kernel_64(vec_n, x);
            idx = vec_n;
        } else {
            /* fewer than 64 elements: seed with the first one */
            lowest = x[0];
            idx = 1;
        }

        for (; idx < n; idx++) {
            if (x[idx] < lowest)
                lowest = x[idx];
        }
        return lowest;
    }

    /* strided input: element 0 seeds the result, scanning starts at element 1 */
    lowest = x[0];
    idx = inc_x;
    seen = 1;
    unrolled_end = (n - 1) & -4;

    for (; seen < unrolled_end; seen += 4, idx += inc_x * 4) {
        for (lane = 0; lane < 4; lane++) {
            if (x[idx + lane * inc_x] < lowest)
                lowest = x[idx + lane * inc_x];
        }
    }

    for (; seen < n; seen++, idx += inc_x) {
        if (x[idx] < lowest)
            lowest = x[idx];
    }
    return lowest;
}

246
kernel/zarch/srot.c Normal file
View File

@ -0,0 +1,246 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Applies a plane rotation in place to two unit-stride vectors:
 *     x[i] = c*x[i] + s*y[i]
 *     y[i] = c*y[i] - s*x[i]     (using the old x[i])
 *
 * Each loop pass handles 64 elements (256 bytes) of both vectors as four
 * identical 16-element groups; the pass count is n >> 6.  The body runs before
 * the count is tested, so n must be a non-zero multiple of 64 (the caller in
 * this file passes n & -64 only when it is > 0).
 *
 *   c, s - pointers to the rotation coefficients, read once and replicated
 *          into every lane of v0 and v1
 */
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
/* v0 = c in all lanes, v1 = s in all lanes; r0 = pass count, r1 = byte offset */
"vlrepf %%v0,%3 \n\t"
"vlrepf %%v1,%4 \n\t"
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* group 1 (bytes 0..63): v24..v27 = x, v16..v19 = y */
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
/* v28..v31 = x*c (start of the new x), v20..v23 = x*s */
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
/* new x = y*s + x*c; new y = y*c - x*s */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
/* write back the rotated x and y */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
/* group 2 (bytes 64..127) */
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
/* group 3 (bytes 128..191) */
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
/* group 4 (bytes 192..255) */
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
/* advance 256 bytes and repeat while the pass counter is non-zero */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Apply a plane rotation to the vector pair (x, y):
 *
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]
 *
 * n            number of elements to rotate; nothing is done when n <= 0
 * x, inc_x     first vector and its element stride
 * y, inc_y     second vector and its element stride
 * c, s         rotation coefficients
 *
 * When both strides are 1 the leading (n & -64) elements are handed to
 * srot_kernel_64 and only the remainder is rotated by the scalar loop.
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k;
    BLASLONG bulk;
    BLASLONG px = 0, py = 0;
    FLOAT rotated;

    if (n <= 0)
        return 0;

    if (inc_x != 1 || inc_y != 1) {
        /* General strides: plain scalar loop over both vectors. */
        for (k = 0; k < n; k++) {
            rotated = c*x[px] + s*y[py];
            y[py] = c*y[py] - s*x[px];
            x[px] = rotated;
            px += inc_x;
            py += inc_y;
        }
        return 0;
    }

    /* Contiguous case: vector kernel on the largest multiple of 64. */
    bulk = n & -64;
    if (bulk > 0) {
        /* The kernel takes the coefficients by address. */
        FLOAT cos_arg = c;
        FLOAT sin_arg = s;
        srot_kernel_64(bulk, x, y, &cos_arg, &sin_arg);
    }

    /* Scalar clean-up for the elements the kernel did not cover. */
    for (k = bulk; k < n; k++) {
        rotated = c*x[k] + s*y[k];
        y[k] = c*y[k] - s*x[k];
        x[k] = rotated;
    }
    return 0;
}

201
kernel/zarch/sscal.c Normal file
View File

@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Scale the leading elements of x in place by da using z/Architecture
 * vector instructions.
 *
 * The loop runs n >> 5 times and each pass covers 128 bytes (eight 16-byte
 * vectors, i.e. 32 four-byte elements), so a remainder of n modulo 32 is
 * not touched.  The loop counter is tested at the bottom (brctg), so n must
 * be at least 32 on entry; the driver in this file only calls the kernel
 * with a positive multiple of 32.
 *
 * Asm operands: %0 = n, %1 = da (memory operand), %2 = x.
 * Scratch:      r0 = remaining passes, r1 = byte offset into x,
 *               v0 = da replicated into every word element,
 *               v24-v27 = data being scaled.
 */
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x)
{
__asm__ volatile (
/* v0 = da in all word elements; r0 = n / 32; r1 = 0 */
"vlrepf %%v0,%1 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch (code 2: store access) 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%2) \n\t"
/* eight load / multiply-by-v0 / store-back triples, 16 bytes each */
"vl %%v24, 0(%%r1,%2) \n\t"
"vfmsb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vfmsb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vfmsb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vfmsb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%2) \n\t"
"vfmsb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%2) \n\t"
"vfmsb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%2) \n\t"
"vfmsb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%2) \n\t"
"vfmsb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
/* advance by 128 bytes; decrement r0 and loop while it is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
);
}
/*
 * Overwrite the leading elements of x with zero using z/Architecture
 * vector stores.
 *
 * The loop runs n >> 5 times and each pass stores 128 bytes (eight 16-byte
 * vectors), so a remainder of n modulo 32 is not touched.  The counter is
 * tested at the bottom (brctg), so n must be at least 32 on entry; the
 * driver in this file only calls the kernel with a positive multiple of 32.
 *
 * Asm operands: %0 = n, %1 = x.
 * Scratch:      r0 = remaining passes, r1 = byte offset into x,
 *               v24-v27 = all-zero vectors.
 */
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
/* clear the four source vectors once, outside the loop */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
/* r0 = n / 32; r1 = 0 */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch (code 2: store access) 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%1) \n\t"
"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
/* advance by 128 bytes; decrement r0 and loop while it is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
}
/*
 * Scale a vector in place: x[k] <- da * x[k] for n elements spaced inc_x
 * apart.  When da compares equal to zero the elements are overwritten with
 * 0.0 rather than multiplied.
 *
 * Nothing is done when n <= 0 or inc_x <= 0.  dummy0, dummy1, y, inc_y,
 * dummy and dummy2 are not used by this routine.
 *
 * For inc_x == 1 the leading (n & -32) elements go through the vector
 * kernels (sscal_kernel_32 / sscal_kernel_32_zero) and the remainder is
 * finished with a scalar loop.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG pos = 0;      /* index into x */
    BLASLONG count = 0;    /* elements processed so far (strided path) */
    BLASLONG head;
    int zero_fill;

    if (!(n > 0 && inc_x > 0))
        return 0;

    zero_fill = (da == 0.0);

    if (inc_x == 1) {
        /* Contiguous data: vector kernel first, scalar loop for the rest. */
        head = n & -32;
        if (zero_fill) {
            if (head > 0)
                sscal_kernel_32_zero(head, x);
            for (pos = head; pos < n; pos++)
                x[pos] = 0.0;
        } else {
            if (head > 0)
                sscal_kernel_32(head, da, x);
            for (pos = head; pos < n; pos++)
                x[pos] = da * x[pos];
        }
        return 0;
    }

    /* Strided data: visit every inc_x-th element in ascending order. */
    if (zero_fill) {
        for (count = 0; count < n; count++) {
            x[pos] = 0.0;
            pos += inc_x;
        }
    } else {
        for (count = 0; count < n; count++) {
            x[pos] = da * x[pos];
            pos += inc_x;
        }
    }
    return 0;
}

164
kernel/zarch/sswap.c Normal file
View File

@ -0,0 +1,164 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Exchange the leading elements of x and y using z/Architecture vector
 * loads and stores.
 *
 * The loop runs n >> 6 times and each pass exchanges 256 bytes (sixteen
 * 16-byte vectors, i.e. 64 four-byte elements) per array, so a remainder of
 * n modulo 64 is not touched.  The counter is tested at the bottom (brctg),
 * so n must be at least 64 on entry; the driver in this file only calls the
 * kernel with a positive multiple of 64.
 *
 * Asm operands: %0 = n, %1 = x, %2 = y.
 * Scratch:      r0 = remaining passes, r1 = byte offset into both arrays,
 *               v16-v31 = the 256-byte block read from x,
 *               v0-v7   = 128-byte half blocks read from y.
 */
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
/* r0 = n / 64; r1 = 0 */
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch (code 2: store access) 1024 bytes ahead in both arrays */
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* read the whole 256-byte block of x into v16-v31 */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
/* first 128 bytes of y -> v0-v7 -> x */
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
/* second 128 bytes of y -> v0-v7 -> x */
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"
/* write the saved x block (v16-v31) to y */
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
/* advance by 256 bytes; decrement r0 and loop while it is non-zero */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Exchange the contents of two vectors: x[k] <-> y[k] for n elements,
 * stepping through x by inc_x and through y by inc_y.
 *
 * Nothing is done when n <= 0.  dummy0, dummy1, dummy3, dummy and dummy2
 * are not used by this routine.
 *
 * When both strides are 1 the leading (n & -64) elements are exchanged by
 * sswap_kernel_64 and the remainder by a scalar loop.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG bulk;
    BLASLONG px = 0, py = 0;
    FLOAT held;

    if (n <= 0)
        return 0;

    if (inc_x != 1 || inc_y != 1) {
        /* General strides: element-by-element exchange. */
        for (k = 0; k < n; k++) {
            held = y[py];
            y[py] = x[px];
            x[px] = held;
            px += inc_x;
            py += inc_y;
        }
        return 0;
    }

    /* Contiguous case: vector kernel on the largest multiple of 64. */
    bulk = n & -64;
    if (bulk > 0)
        sswap_kernel_64(bulk, x, y);

    /* Scalar clean-up for the elements the kernel did not cover. */
    for (k = bulk; k < n; k++) {
        held = y[k];
        y[k] = x[k];
        x[k] = held;
    }
    return 0;
}

221
kernel/zarch/zamax.c Normal file
View File

@ -0,0 +1,221 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
/*
 * Return the maximum of |re| + |im| over the leading complex elements of x
 * (interleaved real/imaginary pairs), computed with z/Architecture vector
 * instructions.
 *
 * The instructions used (vleg with 8-byte offsets, vflpdb, vfadb, vfchdb)
 * operate on 64-bit floating-point elements, so the kernel treats x as
 * double precision.
 *
 * The loop runs n >> 4 times and each pass covers 256 bytes, i.e. 16
 * complex elements, so a remainder of n modulo 16 is not examined.  The
 * counter is tested at the bottom (brctg), so n must be at least 16 on
 * entry; the driver in this file only calls the kernel with a positive
 * multiple of 16.
 *
 * Asm operands: %0 = result, %1 = n, %2 = x.
 * Scratch:      r0 = remaining passes, r1 = byte offset into x,
 *               v0 = running maximum (two lanes), v16-v27 = temporaries.
 */
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amax;
__asm__ volatile (
/* seed v0 with |re|+|im| of the first two complex elements:
   real parts go to v0, imaginary parts to v16, one element per lane */
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
/* r0 = n / 16; r1 = 0 */
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch (code 1: fetch access) 1024 bytes ahead of the current offset */
"pfd 1, 1024(%%r1,%2) \n\t"
/* first 128 bytes: de-interleave 8 complex elements, real parts into
   v16/v18/v20/v22 and imaginary parts into v17/v19/v21/v23 */
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
/* absolute values */
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* |re| + |im| for each element, packed into v16-v19 */
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
/* pairwise maximum tree: vfchdb builds an "a > b" lane mask and vsel
   keeps a where the mask is set and b elsewhere; the last step folds
   the result into the running maximum v0 */
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v24,%%v25 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v26,%%v0 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
/* second 128 bytes: same sequence for the next 8 complex elements */
"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v24,%%v25 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v26,%%v0 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
/* advance by 256 bytes; decrement r0 and loop while it is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: compare lane 1 of v0 against lane 0, keep the larger
   in lane 0, then copy it out through f0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
return amax;
}
/*
 * Return the largest value of |Re(x[k])| + |Im(x[k])| over the n complex
 * elements of x.
 *
 * n        number of complex elements
 * x        interleaved (real, imaginary) pairs
 * inc_x    stride between consecutive elements, counted in complex elements
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * For inc_x == 1 the leading (n & -16) elements are reduced by
 * zamax_kernel_16 and the remainder by a scalar loop; other strides use a
 * scalar loop unrolled by four.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;     /* unit stride: complex index; strided: offset into x */
    BLASLONG j = 0;     /* strided path: number of elements examined */
    FLOAT maxf = 0.0;
    FLOAT cand;
    BLASLONG inc_x2;
    BLASLONG n1;

    if (n <= 0 || inc_x <= 0) return (maxf);

    if (inc_x == 1) {
        n1 = n & -16;
        if (n1 > 0) {
            maxf = zamax_kernel_16(n1, x);
            i = n1;
        } else {
            /* Too short for the kernel: seed with the first element. */
            maxf = CABS1(x,0);
            i++;
        }

        /*
         * Scalar tail.  Element i lives at x[2*i], x[2*i+1] and must be
         * measured with CABS1 (|re| + |im|), the same quantity the kernel
         * and the strided path use; looking at the real part alone would
         * ignore the imaginary component of every tail element.
         */
        while (i < n) {
            cand = CABS1(x, i*2);
            if (cand > maxf) {
                maxf = cand;
            }
            i++;
        }
        return (maxf);
    }

    /* Strided path: i walks x in steps of 2*inc_x FLOATs. */
    inc_x2 = 2 * inc_x;
    maxf = CABS1(x,0);
    i += inc_x2;
    j++;

    /* Unrolled by four; the first element was consumed as the seed. */
    n1 = (n - 1) & -4;
    while (j < n1) {
        cand = CABS1(x,i);
        if (cand > maxf) {
            maxf = cand;
        }
        cand = CABS1(x,i+inc_x2);
        if (cand > maxf) {
            maxf = cand;
        }
        cand = CABS1(x,i+inc_x2*2);
        if (cand > maxf) {
            maxf = cand;
        }
        cand = CABS1(x,i+inc_x2*3);
        if (cand > maxf) {
            maxf = cand;
        }
        i += inc_x2 * 4;
        j += 4;
    }

    /* Remaining elements, one at a time. */
    while (j < n) {
        cand = CABS1(x,i);
        if (cand > maxf) {
            maxf = cand;
        }
        i += inc_x2;
        j++;
    }
    return (maxf);
}

221
kernel/zarch/zamin.c Normal file
View File

@ -0,0 +1,221 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
/*
 * Return the minimum of |re| + |im| over the leading complex elements of x
 * (interleaved real/imaginary pairs), computed with z/Architecture vector
 * instructions.
 *
 * The instructions used (vleg with 8-byte offsets, vflpdb, vfadb, vfchdb)
 * operate on 64-bit floating-point elements, so the kernel treats x as
 * double precision.
 *
 * The loop runs n >> 4 times and each pass covers 256 bytes, i.e. 16
 * complex elements, so a remainder of n modulo 16 is not examined.  The
 * counter is tested at the bottom (brctg), so n must be at least 16 on
 * entry; the driver in this file only calls the kernel with a positive
 * multiple of 16.
 *
 * Asm operands: %0 = result, %1 = n, %2 = x.
 * Scratch:      r0 = remaining passes, r1 = byte offset into x,
 *               v0 = running minimum (two lanes), v16-v27 = temporaries.
 */
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amin;
__asm__ volatile (
/* seed v0 with |re|+|im| of the first two complex elements:
   real parts go to v0, imaginary parts to v16, one element per lane */
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
/* r0 = n / 16; r1 = 0 */
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch (code 1: fetch access) 1024 bytes ahead of the current offset */
"pfd 1, 1024(%%r1,%2) \n\t"
/* first 128 bytes: de-interleave 8 complex elements, real parts into
   v16/v18/v20/v22 and imaginary parts into v17/v19/v21/v23 */
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
/* absolute values */
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* |re| + |im| for each element, packed into v16-v19 */
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
/* pairwise minimum tree: vfchdb builds a "b > a" lane mask (operands
   swapped relative to the max kernel) and vsel keeps a where the mask is
   set and b elsewhere; the last step folds the result into v0 */
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v25,%%v24 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v0,%%v26 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
/* second 128 bytes: same sequence for the next 8 complex elements */
"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v25,%%v24 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v0,%%v26 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
/* advance by 256 bytes; decrement r0 and loop while it is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: compare lane 1 of v0 against lane 0, keep the smaller
   in lane 0, then copy it out through f0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
return amin;
}
/*
 * Return the smallest value of |Re(x[k])| + |Im(x[k])| over the n complex
 * elements of x.
 *
 * n        number of complex elements
 * x        interleaved (real, imaginary) pairs
 * inc_x    stride between consecutive elements, counted in complex elements
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * For inc_x == 1 the leading (n & -16) elements are reduced by
 * zamin_kernel_16 and the remainder by a scalar loop; other strides use a
 * scalar loop unrolled by four.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;     /* unit stride: complex index; strided: offset into x */
    BLASLONG j = 0;     /* strided path: number of elements examined */
    FLOAT minf = 0.0;
    FLOAT cand;
    BLASLONG inc_x2;
    BLASLONG n1;

    if (n <= 0 || inc_x <= 0) return (minf);

    if (inc_x == 1) {
        n1 = n & -16;
        if (n1 > 0) {
            minf = zamin_kernel_16(n1, x);
            i = n1;
        } else {
            /* Too short for the kernel: seed with the first element. */
            minf = CABS1(x,0);
            i++;
        }

        /*
         * Scalar tail.  Element i lives at x[2*i], x[2*i+1] and must be
         * measured with CABS1 (|re| + |im|), the same quantity the kernel
         * and the strided path use; comparing the real part alone could
         * report a value smaller than any element's true |re| + |im|.
         */
        while (i < n) {
            cand = CABS1(x, i*2);
            if (cand < minf) {
                minf = cand;
            }
            i++;
        }
        return (minf);
    }

    /* Strided path: i walks x in steps of 2*inc_x FLOATs. */
    inc_x2 = 2 * inc_x;
    minf = CABS1(x,0);
    i += inc_x2;
    j++;

    /* Unrolled by four; the first element was consumed as the seed. */
    n1 = (n - 1) & -4;
    while (j < n1) {
        cand = CABS1(x,i);
        if (cand < minf) {
            minf = cand;
        }
        cand = CABS1(x,i+inc_x2);
        if (cand < minf) {
            minf = cand;
        }
        cand = CABS1(x,i+inc_x2*2);
        if (cand < minf) {
            minf = cand;
        }
        cand = CABS1(x,i+inc_x2*3);
        if (cand < minf) {
            minf = cand;
        }
        i += inc_x2 * 4;
        j += 4;
    }

    /* Remaining elements, one at a time. */
    while (j < n) {
        cand = CABS1(x,i);
        if (cand < minf) {
            minf = cand;
        }
        i += inc_x2;
        j++;
    }
    return (minf);
}

View File

@ -25,92 +25,98 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT asum;
__asm__ (
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v22 \n\t"
"vzero %%v23 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v23,%%v23,%%v26 \n\t"
"vfadb %%v22,%%v22,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v23,%%v23,%%v30 \n\t"
"vfadb %%v22,%%v22,%%v31 \n\t"
"vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v23,%%v23,%%v26 \n\t"
"vfadb %%v22,%%v22,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v23,%%v23,%%v30 \n\t"
"vfadb %%v22,%%v22,%%v31 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
"vfadb %%v24,%%v0,%%v1 \n\t"
"vfadb %%v25,%%v23,%%v22 \n\t"
"vfadb %%v0,%%v25,%%v24 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %[asum] ,%%f0"
: [asum] "=f"(asum),[ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x)
: "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v2 \n\t"
"vfadb %%v0,%%v0,%%v3 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
@ -128,7 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{
sumf=zasum_kernel_16(n1, x );
sumf = zasum_kernel_16(n1, x);
i=n1;
ip=2*n1;
}

View File

@ -23,142 +23,98 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
*****************************************************************************/
#include "common.h"
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) {
BLASLONG tempR1 ;
__asm__ ("pfd 1, 0(%[x_tmp]) \n\t"
"pfd 2, 0(%[y_tmp]) \n\t"
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
#if !defined(CONJ)
"lgdr %[t1],%[alpha_r] \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
"lgdr %[t1],%[alpha_i] \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
"vflcdb %%v29,%%v29 \n\t" //complement both
"vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i}
"vlrepg %%v0,0(%3) \n\t"
"vleg %%v1,8(%3),0 \n\t"
"wflcdb %%v1,%%v1 \n\t"
"vleg %%v1,8(%3),1 \n\t"
#else
"vleg %%v0,0(%3),1 \n\t"
"vflcdb %%v0,%%v0 \n\t"
"vleg %%v0,0(%3),0 \n\t"
"vlrepg %%v1,8(%3) \n\t"
#endif
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
#else
"lgdr %[t1],%[alpha_i] \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
"lgdr %[t1],%[alpha_r] \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
"vflcdb %%v28,%%v28 \n\t" //complement both
"vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r}
#endif
"xgr %[t1],%[t1] \n\t"
"sllg %[tmp],%[tmp],4 \n\t"
"vl %%v30 , 0(%[t1],%[y_tmp]) \n\t"
"vl %%v31 , 16(%[t1],%[y_tmp]) \n\t"
"vl %%v6 , 32(%[t1],%[y_tmp]) \n\t"
"vl %%v7 , 48(%[t1],%[y_tmp]) \n\t"
"vl %%v20 , 0(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 16(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 32(%[t1],%[x_tmp]) \n\t"
"vl %%v23 , 48(%[t1],%[x_tmp]) \n\t"
"lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition
"j 2f \n\t"
".align 16 \n\t"
"1: \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmadb %%v16, %%v20, %%v28, %%v16 \n\t"
"vfmadb %%v17, %%v21, %%v28, %%v17 \n\t"
"vfmadb %%v18, %%v22, %%v28, %%v18 \n\t"
"vfmadb %%v19, %%v23, %%v28, %%v19 \n\t"
"vl %%v30, 64(%[t1],%[y_tmp]) \n\t"
"vl %%v31, 80(%[t1],%[y_tmp]) \n\t"
"vl %%v6 , 96(%[t1],%[y_tmp]) \n\t"
"vl %%v7 , 112(%[t1],%[y_tmp]) \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"
"vst %%v16 , 0(%[t1],%[y_tmp]) \n\t"
"vst %%v17 , 16(%[t1],%[y_tmp]) \n\t"
"vst %%v18 , 32(%[t1],%[y_tmp]) \n\t"
"vst %%v19 , 48(%[t1],%[y_tmp]) \n\t"
"la %[t1],64(%[t1] ) \n\t"
"2: \n\t"
"pfd 1, 256(%[t1],%[x_tmp]) \n\t"
"pfd 2, 256(%[t1],%[y_tmp]) \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
"vfmadb %%v30, %%v20, %%v28, %%v30 \n\t"
"vfmadb %%v31, %%v21, %%v28, %%v31 \n\t"
"vfmadb %%v6, %%v22, %%v28, %%v6 \n\t"
"vfmadb %%v7, %%v23, %%v28, %%v7 \n\t"
"vl %%v16, 64(%[t1],%[y_tmp]) \n\t"
"vl %%v17, 80(%[t1],%[y_tmp]) \n\t"
"vl %%v18, 96(%[t1],%[y_tmp]) \n\t"
"vl %%v19, 112(%[t1],%[y_tmp]) \n\t"
"vfmadb %%v30, %%v24, %%v29, %%v30 \n\t"
"vfmadb %%v31, %%v25, %%v29, %%v31 \n\t"
"vfmadb %%v6, %%v26, %%v29, %%v6 \n\t"
"vfmadb %%v7, %%v27, %%v29, %%v7 \n\t"
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"
"vst %%v28,0(%%r1,%2) \n\t"
"vst %%v29,16(%%r1,%2) \n\t"
"vst %%v30,32(%%r1,%2) \n\t"
"vst %%v31,48(%%r1,%2) \n\t"
"vst %%v30 , 0(%[t1],%[y_tmp]) \n\t"
"vst %%v31 , 16(%[t1],%[y_tmp]) \n\t"
"vst %%v6 , 32(%[t1],%[y_tmp]) \n\t"
"vst %%v7 , 48(%[t1],%[y_tmp]) \n\t"
"la %[t1],64(%[t1] ) \n\t"
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,80(%%r1,%1) \n\t"
"vl %%v18,96(%%r1,%1) \n\t"
"vl %%v19,112(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"
"clgrjl %[t1],%[tmp],1b \n\t"
//----------------------------------------------------------------------
"vfmadb %%v16, %%v20, %%v28, %%v16 \n\t"
"vfmadb %%v17, %%v21, %%v28, %%v17 \n\t"
"vfmadb %%v18, %%v22, %%v28, %%v18 \n\t"
"vfmadb %%v19, %%v23, %%v28, %%v19 \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
"vst %%v16 , 0(%[t1],%[y_tmp]) \n\t"
"vst %%v17 , 16(%[t1],%[y_tmp]) \n\t"
"vst %%v18 , 32(%[t1],%[y_tmp]) \n\t"
"vst %%v19 , 48(%[t1],%[y_tmp]) \n\t"
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
: [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1)
: [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
: "cc", "v6","v7", "v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"vst %%v28,64(%%r1,%2) \n\t"
"vst %%v29,80(%%r1,%2) \n\t"
"vst %%v30,96(%%r1,%2) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2];
if (n <= 0) return (0);
@ -166,8 +122,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -8;
if (n1) {
zaxpy_kernel_8(n1, x, y, da_r,da_i);
if (n1) {
da[0] = da_r;
da[1] = da_i;
zaxpy_kernel_8(n1, x, y, da);
ix = 2 * n1;
}
i = n1;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -24,71 +24,28 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n)
: [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
/*
 * zcopy_kernel_16: block copy from x to y using the storage-to-storage MVC
 * instruction.
 *
 *   n  element count; the loop runs (n >> 4) times and moves 256 bytes per
 *      pass, i.e. 16 bytes per element (matches 2 FLOATs per element if FLOAT
 *      is an 8-byte double -- TODO confirm against common.h).
 *   x  source buffer (operand %1)
 *   y  destination buffer (operand %2)
 *
 * The loop is bottom-tested with BRCTG, so the body always executes at least
 * once; the caller is presumably expected to pass a non-zero multiple of 16
 * (n >> 4 == 0 would make the count register wrap) -- verify against CNAME.
 *
 * The asm declares no output operands; the write to y is communicated to the
 * compiler through the "memory" clobber. r0, r1 and r2 are used as scratch
 * and are listed as clobbers.
 */
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"lgr %%r1,%1 \n\t" /* r1 = running source pointer (starts at x) */
"lgr %%r2,%2 \n\t" /* r2 = running destination pointer (starts at y) */
"srlg %%r0,%0,4 \n\t" /* r0 = n / 16 = number of 256-byte passes */
"0: \n\t"
"pfd 1, 1024(%%r1) \n\t" /* prefetch source 1024 bytes ahead (code 1: fetch access) */
"pfd 2, 1024(%%r2) \n\t" /* prefetch destination 1024 bytes ahead (code 2: store access) */
"mvc 0(256,%%r2),0(%%r1) \n\t" /* move 256 bytes from (r1) to (r2) */
"agfi %%r1,256 \n\t" /* advance source pointer */
"agfi %%r2,256 \n\t" /* advance destination pointer */
"brctg %%r0,0b " /* --r0; loop while r0 != 0 */
:
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","r2"
);
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
@ -137,9 +94,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}
}
return(0);
return(0);
}

View File

@ -23,137 +23,92 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
*****************************************************************************/
#include "common.h"
#if defined(Z13)
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
__asm__ volatile(
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %[n_tmp],%[n_tmp],3 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"vzero %%v28 \n\t"
"vzero %%v29 \n\t"
"vzero %%v30 \n\t"
"vzero %%v31 \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"
"vl %%v16, 64(%%r1,%1) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t"
"vl %%v19, 112(%%r1,%1) \n\t"
"vl %%v0, 64(%%r1,%2) \n\t"
"vl %%v1, 80(%%r1,%2) \n\t"
"vl %%v2, 96(%%r1,%2) \n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"
"vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
"la %%r1,128(%%r1) \n\t"
"brctg %[n_tmp],1b \n\t"
"vfadb %%v24,%%v26,%%v24 \n\t"
"vfadb %%v25,%%v25,%%v27 \n\t"
"vsteg %%v24, 0(%[ptr_d]),0 \n\t"
"vsteg %%v24, 8(%[ptr_d]),1 \n\t"
"vsteg %%v25,16(%[ptr_d]),1 \n\t"
"vsteg %%v25,24(%[ptr_d]),0 \n\t"
: [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n)
: [mem_x] "m"( *(const double (*)[2*n])x),
[mem_y] "m"( *(const double (*)[2*n])y),
[ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d)
: "cc", "r1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v24,%%v24,%%v26 \n\t"
"vfadb %%v24,%%v24,%%v28 \n\t"
"vfadb %%v24,%%v24,%%v30 \n\t"
"vfadb %%v25,%%v25,%%v27 \n\t"
"vfadb %%v25,%%v25,%%v29 \n\t"
"vfadb %%v25,%%v25,%%v31 \n\t"
"vsteg %%v24,0(%3),0 \n\t"
"vsteg %%v24,8(%3),1 \n\t"
"vsteg %%v25,16(%3),1 \n\t"
"vsteg %%v25,24(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
/*
 * Portable C fallback for the complex dot-product kernel.
 *
 * x and y are read as interleaved pairs (x[2k], x[2k+1]).  Four independent
 * partial sums are accumulated and written to d:
 *
 *   d[0] = sum x[2k]   * y[2k]
 *   d[1] = sum x[2k+1] * y[2k+1]
 *   d[2] = sum x[2k]   * y[2k+1]
 *   d[3] = sum x[2k+1] * y[2k]
 *
 * Elements are consumed four pairs at a time while the running count is
 * below n, so n is effectively rounded up to a multiple of 4.  d is written
 * only after all reads of x and y are complete.
 */
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
    FLOAT acc_ee = 0.0;   /* even(x) * even(y) */
    FLOAT acc_oo = 0.0;   /* odd(x)  * odd(y)  */
    FLOAT acc_eo = 0.0;   /* even(x) * odd(y)  */
    FLOAT acc_oe = 0.0;   /* odd(x)  * even(y) */
    const FLOAT *px = x;
    const FLOAT *py = y;
    BLASLONG done;
    int k;

    for (done = 0; done < n; done += 4) {
        /* One group of four consecutive pairs, processed in memory order so
           each accumulator sees its terms in the same sequence as before. */
        for (k = 0; k < 4; k++) {
            acc_ee += px[0] * py[0];
            acc_oo += px[1] * py[1];
            acc_eo += px[0] * py[1];
            acc_oe += px[1] * py[0];
            px += 2;
            py += 2;
        }
    }

    d[0] = acc_ee;
    d[1] = acc_oo;
    d[2] = acc_eo;
    d[3] = acc_oe;
}
#endif
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix=0, iy=0;
BLASLONG i;
BLASLONG ix, iy;
OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
@ -167,14 +122,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -8;
BLASLONG j=0;
if (n1){
if (n1)
zdot_kernel_8(n1, x, y, dot);
i = n1;
j = n1 <<1;
}
i = n1;
BLASLONG j = i * 2;
while (i < n) {

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"lgdr %%r1,%[cos] \n\t"
"vlvgp %%v0,%%r1,%%r1 \n\t"
"lgdr %%r1,%[sin] \n\t"
"vlvgp %%v1,%%r1,%%r1 \n\t"
"sllg %[tmp],%[tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v27,112(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v19,112(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"clgrjl %%r1,%[tmp],1b \n\t"
: [mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
: "cc","r1" ,"v0","v1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
__asm__ (
"vlrepg %%v0,%3 \n\t"
"vlrepg %%v1,%4 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@ -214,8 +204,11 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
zrot_kernel_16(n1, x, y, c, s);
{
FLOAT cosa,sina;
cosa=c;
sina=s;
zrot_kernel_16(n1, x, y, &cosa, &sina);
i=n1;
ix=2*n1;
}
@ -234,6 +227,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}
}
else
{
@ -259,3 +253,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}

View File

@ -23,270 +23,211 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
*****************************************************************************/
#include "common.h"
/*
 * zscal_kernel_8: in-place scaling of x by the complex factor held in
 * alpha[0] / alpha[1], eight 16-byte vectors (128 bytes) per loop pass.
 *
 *   n      element count; the loop runs (n >> 3) times
 *   alpha  two FLOATs (operand %1); read at offsets 0 and 8
 *   x      buffer updated in place (operand %2)
 *
 * Per 16-byte vector {a, b} the loop computes
 *   {a * alpha[0] - b * alpha[1],  b * alpha[0] + a * alpha[1]}
 * which is a complex multiply if x stores (re, im) pairs and alpha is
 * (re, im) -- presumably the case; confirm against the caller.
 *
 * The loop is bottom-tested with BRCTG, so the body runs at least once;
 * n is presumably a non-zero multiple of 8 -- verify against CNAME.
 * No output operands are declared; stores to x are covered by the "memory"
 * clobber.
 */
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlrepg %%v0,0(%1) \n\t" /* v0 = {alpha[0], alpha[0]} */
"vleg %%v1,8(%1),0 \n\t" /* v1[0] = alpha[1] */
"wflcdb %%v1,%%v1 \n\t" /* v1[0] = -alpha[1] (scalar form, element 0 only) */
"vleg %%v1,8(%1),1 \n\t" /* v1[1] = alpha[1]  ->  v1 = {-alpha[1], alpha[1]} */
"srlg %%r0,%0,3 \n\t" /* r0 = n / 8 = loop count */
"xgr %%r1,%%r1 \n\t" /* r1 = byte offset into x, starts at 0 */
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t" /* prefetch x 1024 bytes ahead for store access */
/* load eight vectors of x */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* v24..v31 = the same vectors with their two doublewords swapped */
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"
"vpdi %%v28,%%v20,%%v20,4 \n\t"
"vpdi %%v29,%%v21,%%v21,4 \n\t"
"vpdi %%v30,%%v22,%%v22,4 \n\t"
"vpdi %%v31,%%v23,%%v23,4 \n\t"
/* v16..v23 *= alpha[0] (both lanes) */
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"
/* v16..v23 += swapped * {-alpha[1], alpha[1]} */
"vfmadb %%v16,%%v24,%%v1,%%v16 \n\t"
"vfmadb %%v17,%%v25,%%v1,%%v17 \n\t"
"vfmadb %%v18,%%v26,%%v1,%%v18 \n\t"
"vfmadb %%v19,%%v27,%%v1,%%v19 \n\t"
"vfmadb %%v20,%%v28,%%v1,%%v20 \n\t"
"vfmadb %%v21,%%v29,%%v1,%%v21 \n\t"
"vfmadb %%v22,%%v30,%%v1,%%v22 \n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23 \n\t"
/* write the eight results back over the inputs */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t" /* advance byte offset by one 128-byte pass */
"brctg %%r0,0b " /* --r0; loop while r0 != 0 */
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vleg %%v0,8(%1),0 \n\t"
"wflcdb %%v0,%%v0 \n\t"
"vleg %%v0,8(%1),1 \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) {
BLASLONG tempR1 ;
__asm__ (
"pfd 2, 0(%[x_tmp]) \n\t"
#if !defined(CONJ)
"lgdr %[t1],%[alpha_r] \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
"lgdr %[t1],%[alpha_i] \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
"vflcdb %%v29,%%v29 \n\t" //complement both
"vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i}
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vpdi %%v16,%%v16,%%v16,4 \n\t"
"vpdi %%v17,%%v17,%%v17,4 \n\t"
"vpdi %%v18,%%v18,%%v18,4 \n\t"
"vpdi %%v19,%%v19,%%v19,4 \n\t"
"vpdi %%v20,%%v20,%%v20,4 \n\t"
"vpdi %%v21,%%v21,%%v21,4 \n\t"
"vpdi %%v22,%%v22,%%v22,4 \n\t"
"vpdi %%v23,%%v23,%%v23,4 \n\t"
#else
"lgdr %[t1],%[alpha_i] \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
"lgdr %[t1],%[alpha_r] \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
"vflcdb %%v28,%%v28 \n\t" //complement both
"vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r}
#endif
"xgr %[t1],%[t1] \n\t"
"sllg %[tmp],%[tmp],4 \n\t"
"vl %%v20 , 0(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 16(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 32(%[t1],%[x_tmp]) \n\t"
"vl %%v23 , 48(%[t1],%[x_tmp]) \n\t"
"lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition
"j 2f \n\t"
".align 16 \n\t"
"1: \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmdb %%v16, %%v20, %%v28 \n\t"
"vfmdb %%v17, %%v21, %%v28 \n\t"
"vfmdb %%v18, %%v22, %%v28 \n\t"
"vfmdb %%v19, %%v23, %%v28 \n\t"
"vl %%v20, 64(%[t1],%[x_tmp]) \n\t"
"vl %%v21, 80(%[t1],%[x_tmp]) \n\t"
"vl %%v22, 96(%[t1],%[x_tmp]) \n\t"
"vl %%v23, 112(%[t1],%[x_tmp]) \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vst %%v16 , 0(%[t1],%[x_tmp]) \n\t"
"vst %%v17 , 16(%[t1],%[x_tmp]) \n\t"
"vst %%v18 , 32(%[t1],%[x_tmp]) \n\t"
"vst %%v19 , 48(%[t1],%[x_tmp]) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
/*
 * zscal_kernel_8_zero_i: in-place scaling of x by the single value alpha[0];
 * alpha[1] is never read here (the name suggests this is the path for a zero
 * imaginary part -- confirm against the caller).
 *
 *   n      element count; the loop runs (n >> 3) times, 128 bytes per pass
 *   alpha  operand %1; only the doubleword at offset 0 is loaded
 *   x      buffer updated in place (operand %2)
 *
 * Every doubleword of x in the processed range is multiplied by alpha[0].
 * The loop is bottom-tested with BRCTG, so the body runs at least once;
 * n is presumably a non-zero multiple of 8 -- verify against CNAME.
 * No output operands are declared; stores to x are covered by the "memory"
 * clobber.
 */
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlrepg %%v0,0(%1) \n\t" /* v0 = {alpha[0], alpha[0]} */
"srlg %%r0,%0,3 \n\t" /* r0 = n / 8 = loop count */
"xgr %%r1,%%r1 \n\t" /* r1 = byte offset into x, starts at 0 */
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t" /* prefetch x 1024 bytes ahead for store access */
/* load eight vectors of x */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* multiply both lanes of each vector by alpha[0] */
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"
/* write the eight results back over the inputs */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t" /* advance byte offset by one 128-byte pass */
"brctg %%r0,0b " /* --r0; loop while r0 != 0 */
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"la %[t1],64(%[t1] ) \n\t"
"2: \n\t"
"pfd 2, 256(%[t1],%[x_tmp]) \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmdb %%v30, %%v20, %%v28 \n\t"
"vfmdb %%v31, %%v21, %%v28 \n\t"
"vfmdb %%v6, %%v22, %%v28 \n\t"
"vfmdb %%v7, %%v23, %%v28 \n\t"
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"
"vfmadb %%v30, %%v24, %%v29, %%v30 \n\t"
"vfmadb %%v31, %%v25, %%v29, %%v31 \n\t"
"vfmadb %%v6, %%v26, %%v29, %%v6 \n\t"
"vfmadb %%v7, %%v27, %%v29, %%v7 \n\t"
"vst %%v30 , 0(%[t1],%[x_tmp]) \n\t"
"vst %%v31 , 16(%[t1],%[x_tmp]) \n\t"
"vst %%v6 , 32(%[t1],%[x_tmp]) \n\t"
"vst %%v7 , 48(%[t1],%[x_tmp]) \n\t"
"la %[t1],64(%[t1] ) \n\t"
"clgrjl %[t1],%[tmp],1b \n\t"
//----------------------------------------------------------------------
"vfmdb %%v16, %%v20, %%v28 \n\t"
"vfmdb %%v17, %%v21, %%v28 \n\t"
"vfmdb %%v18, %%v22, %%v28 \n\t"
"vfmdb %%v19, %%v23, %%v28 \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
"vst %%v16 , 0(%[t1],%[x_tmp]) \n\t"
"vst %%v17 , 16(%[t1],%[x_tmp]) \n\t"
"vst %%v18 , 32(%[t1],%[x_tmp]) \n\t"
"vst %%v19 , 48(%[t1],%[x_tmp]) \n\t"
: [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1)
: [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
: "cc", "v6","v7", "v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) {
__asm__ ( "pfd 2, 0(%1) \n\t"
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint
"vflcdb %%v16,%%v16 \n\t" //complement both
"vlvgg %%v16,%%r0,0 \n\t" //restore 1st
"vlr %%v17 ,%%v16 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"vl %%v24, 0(%[x_ptr]) \n\t"
"vfmdb %%v24,%%v24,%%v16 \n\t"
"vsteg %%v24, 0(%[x_ptr]),1 \n\t"
"vsteg %%v24, 8(%[x_ptr]),0 \n\t"
"vl %%v25, 16(%[x_ptr]) \n\t"
"vfmdb %%v25,%%v25,%%v17 \n\t"
"vsteg %%v25, 16(%[x_ptr]),1 \n\t"
"vsteg %%v25, 24(%[x_ptr]),0 \n\t"
"vl %%v26, 32(%[x_ptr]) \n\t"
"vfmdb %%v26,%%v26,%%v16 \n\t"
"vsteg %%v26, 32(%[x_ptr]),1 \n\t"
"vsteg %%v26, 40(%[x_ptr]),0 \n\t"
"vl %%v27, 48(%[x_ptr]) \n\t"
"vfmdb %%v27,%%v27,%%v17 \n\t"
"vsteg %%v27, 48(%[x_ptr]),1 \n\t"
"vsteg %%v27, 56(%[x_ptr]),0 \n\t"
"vl %%v28, 64(%[x_ptr]) \n\t"
"vfmdb %%v28,%%v28,%%v16 \n\t"
"vsteg %%v28, 64(%[x_ptr]),1 \n\t"
"vsteg %%v28, 72(%[x_ptr]),0 \n\t"
"vl %%v29, 80(%[x_ptr]) \n\t"
"vfmdb %%v29,%%v29,%%v17 \n\t"
"vsteg %%v29, 80(%[x_ptr]),1 \n\t"
"vsteg %%v29, 88(%[x_ptr]),0 \n\t"
"vl %%v30, 96(%[x_ptr]) \n\t"
"vfmdb %%v30,%%v30,%%v16 \n\t"
"vsteg %%v30, 96(%[x_ptr]),1 \n\t"
"vsteg %%v30, 104(%[x_ptr]),0 \n\t"
"vl %%v31, 112(%[x_ptr]) \n\t"
"vfmdb %%v31,%%v31,%%v17 \n\t"
"vsteg %%v31, 112(%[x_ptr]),1 \n\t"
"vsteg %%v31, 120(%[x_ptr]),0 \n\t"
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n),[alpha] "f"(da_i)
:"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31"
);
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
}
/*
 * In-place scaling of a unit-stride complex vector by a factor whose
 * imaginary part is zero: every 16-byte element of x (two doubles) is
 * multiplied component-wise by da_r.
 *
 *   n    - number of complex elements to process. The loop body handles 8
 *          elements (128 bytes) per pass and tests the end address only
 *          after the pass, so n must be positive; presumably it must also be
 *          a multiple of 8 (the visible caller passes n & -8 after checking
 *          n1 > 0) or the last pass runs beyond n elements.
 *   da_r - real scale factor.
 *   x    - vector to scale; assumes FLOAT is double, as the memory operand
 *          is declared as double[2*n].
 *
 * r0 is used as scratch (first the bit pattern of da_r, then the end
 * address); v16-v19 hold the broadcast factor and v24-v31 the data.
 */
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) {
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
/* copy da_r into r0 and replicate it into both lanes of v18, then v19/v16/v17 */
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v18,%%r0,%%r0 \n\t"
"vlr %%v19,%%v18 \n\t"
"vlr %%v16,%%v18 \n\t"
"vlr %%v17,%%v18 \n\t"
/* r0 = x_ptr + n*16 : address one past the last element to scale */
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
/* eight load / multiply / store triples, one complex element each */
"vl %%v24, 0(%[x_ptr]) \n\t"
"vfmdb %%v24,%%v24,%%v18 \n\t"
"vst %%v24, 0(%[x_ptr]) \n\t"
"vl %%v25, 16(%[x_ptr]) \n\t"
"vfmdb %%v25,%%v25,%%v19 \n\t"
"vst %%v25, 16(%[x_ptr]) \n\t"
"vl %%v26, 32(%[x_ptr]) \n\t"
"vfmdb %%v26,%%v26,%%v16 \n\t"
"vst %%v26, 32(%[x_ptr]) \n\t"
"vl %%v27, 48(%[x_ptr]) \n\t"
"vfmdb %%v27,%%v27,%%v17 \n\t"
"vst %%v27, 48(%[x_ptr]) \n\t"
"vl %%v28, 64(%[x_ptr]) \n\t"
"vfmdb %%v28,%%v28,%%v18 \n\t"
"vst %%v28, 64(%[x_ptr]) \n\t"
"vl %%v29, 80(%[x_ptr]) \n\t"
"vfmdb %%v29,%%v29,%%v19 \n\t"
"vst %%v29, 80(%[x_ptr]) \n\t"
"vl %%v30, 96(%[x_ptr]) \n\t"
"vfmdb %%v30,%%v30,%%v16 \n\t"
"vst %%v30, 96(%[x_ptr]) \n\t"
"vl %%v31,112(%[x_ptr]) \n\t"
"vfmdb %%v31,%%v31,%%v17 \n\t"
"vst %%v31,112(%[x_ptr]) \n\t"
/* advance by 128 bytes and repeat while x_ptr is below the end address */
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n),[alpha] "f"(da_r)
: "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Overwrites a unit-stride complex vector with zeros (the alpha == 0 case of
 * the scaling routine; the existing contents of x are never read).
 *
 *   n - number of complex elements to clear. Each pass stores 128 bytes
 *       (8 elements of 16 bytes) and the end test comes after the stores, so
 *       n must be positive; presumably it must also be a multiple of 8 (the
 *       visible caller passes n & -8) or the last pass writes past n elements.
 *   x - vector to clear; assumes FLOAT is double, as the memory operand is
 *       declared as double[2*n].
 *
 * r0 holds the end address; v24-v27 hold the zero pattern.
 */
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
__asm__ ( "pfd 2, 0(%[x_ptr]) \n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
/* r0 = x_ptr + n*16 : address one past the last element to clear */
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
/* prefetch hint 256 bytes ahead, then eight 16-byte zero stores */
"pfd 2, 256( %[x_ptr]) \n\t"
"vst %%v24, 0( %[x_ptr]) \n\t"
"vst %%v25, 16( %[x_ptr]) \n\t"
"vst %%v26, 32( %[x_ptr]) \n\t"
"vst %%v27, 48( %[x_ptr]) \n\t"
"vst %%v24, 64( %[x_ptr]) \n\t"
"vst %%v25, 80( %[x_ptr]) \n\t"
"vst %%v26, 96( %[x_ptr]) \n\t"
"vst %%v27,112( %[x_ptr]) \n\t"
/* advance by 128 bytes and repeat while x_ptr is below the end address */
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x)
: [n] "r"(n)
:"cc" ,"r0","v24","v25","v26","v27"
);
}
static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) {
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3;
FLOAT t0, t1, t2, t3;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];
for (i = 0; i < n; i += 4) {
for (i = 0; i < n; i += 4)
{
t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
@ -303,17 +244,14 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLAS
x[inc_x3] = t3;
x += 4 * inc_x;
}
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
FLOAT temp0;
FLOAT temp1;
FLOAT alpha[2] __attribute__ ((aligned(16)));
if (inc_x != 1) {
inc_x <<= 1;
@ -405,8 +343,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
} else {
BLASLONG n1 = n & -8;
if (n1 > 0) {
zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x);
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
zscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1;
i = n1 * inc_x;
}
@ -432,17 +372,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
if (da_r == 0.0)
if (da_i == 0)
zscal_kernel_8_zero(n1, x);
else
zscal_kernel_8_zero_r(n1, da_i, x);
zscal_kernel_8_zero_r(n1, alpha, x);
else
if (da_i == 0)
zscal_kernel_8_zero_i(n1, da_r, x);
zscal_kernel_8_zero_i(n1, alpha, x);
else
zscal_kernel_8(n1, da_r,da_i, x);
zscal_kernel_8(n1, alpha, x);
i = n1 << 1;
j = n1;
@ -508,5 +450,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return (0);
}

View File

@ -25,220 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(Z13_SWAP_A)
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"
__asm__ volatile(
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
,"v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
/*
 * Exchanges the contents of two unit-stride complex vectors, 256 bytes
 * (16 elements of 16 bytes) per loop pass.
 *
 *   n    - number of complex elements. The pass count is n >> 4 and the loop
 *          is counted down with brctg, which decrements before testing, so
 *          n must be at least 16; any remainder below 16 is not touched here.
 *          NOTE(review): a pass count of 0 would wrap around - confirm that
 *          every caller guards against n < 16.
 *   x, y - vectors to exchange; assumes FLOAT is double, as the memory
 *          operands are declared as double[2*n].
 *
 * Per pass: 256 bytes of x are held in v16-v31, y is copied into x in two
 * 128-byte halves through v0-v7, and finally the saved x data is written to
 * y. r1 is the running byte offset shared by both vectors.
 */
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
/* n_tmp = n / 16 passes; r1 = 0 byte offset */
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
/* save the current 256 bytes of x in v16-v31 */
"vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
/* first 128 bytes of y -> x via v0-v7 */
"vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 112(%%r1,%[ptr_x]) \n\t"
/* second 128 bytes of y -> x, reusing v0-v7 */
"vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 240(%%r1,%[ptr_x]) \n\t"
/* saved x data (v16-v31) -> y */
"vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
/* next 256-byte chunk; loop until the pass counter reaches zero */
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;

437
ztest/Makefile Normal file
View File

@ -0,0 +1,437 @@
# Builds the standalone ztest programs. Each <name>.goto executable is linked
# from <name>.$(SUFFIX) and ../$(LIBNAME); the objects are compiled from one
# shared C source per routine family, with -DCOMPLEX / -DDOUBLE (or -U...)
# selecting the precision variant.
TOPDIR = ..
include $(TOPDIR)/Makefile.system
# Aggregate target: builds every test program listed below.
goto :: sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto sswap.goto dswap.goto cswap.goto zswap.goto isamax.goto idamax.goto icamax.goto izamax.goto samax.goto damax.goto ismax.goto idmax.goto smax.goto dmax.goto isamin.goto idamin.goto icamin.goto izamin.goto samin.goto damin.goto camin.goto zamin.goto ismin.goto idmin.goto smin.goto dmin.goto sgemv.goto dgemv.goto cgemv.goto zgemv.goto sscal.goto dscal.goto cscal.goto zscal.goto saxpy.goto daxpy.goto caxpy.goto zaxpy.goto srot.goto drot.goto crot.goto zrot.goto sasum.goto dasum.goto casum.goto zasum.goto scopy.goto dcopy.goto ccopy.goto zcopy.goto
##################################### Sdot ####################################################
sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Ddot ####################################################
ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Cdot ####################################################
cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Zdot ####################################################
zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Dsdot ####################################################
dsdot.goto : dsdot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## ISAMAX ##############################################
isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## IDAMAX ##############################################
idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## ICAMAX ##############################################
icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## IZAMAX ##############################################
izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## SAMAX ##############################################
samax.goto : samax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## DAMAX ##############################################
damax.goto : damax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## ISMAX ##############################################
ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## IDMAX ##############################################
idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## SMAX ##############################################
smax.goto : smax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## DMAX ##############################################
dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## ISAMIN ##############################################
isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## IDAMIN ##############################################
idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## ICAMIN ##############################################
icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## IZAMIN ##############################################
izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## SAMIN ##############################################
samin.goto : samin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## DAMIN ##############################################
damin.goto : damin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## CAMIN ##############################################
camin.goto : camin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## ZAMIN ##############################################
zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## ISMIN ##############################################
ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## IDMIN ##############################################
idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## SMIN ##############################################
smin.goto : smin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
############################################## DMIN ##############################################
dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Sgemv ####################################################
sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Dgemv ####################################################
dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Cgemv ####################################################
cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Zgemv ####################################################
zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Sscal ####################################################
sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Dscal ####################################################
dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Cscal ####################################################
cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Zscal ####################################################
zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Saxpy ####################################################
saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Daxpy ####################################################
daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Caxpy ####################################################
caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Zaxpy ####################################################
zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Srot ####################################################
srot.goto : srot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Drot ####################################################
drot.goto : drot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Crot ####################################################
crot.goto : crot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Zrot ####################################################
zrot.goto : zrot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Sswap ####################################################
sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Dswap ####################################################
dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Cswap ####################################################
cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Zswap ####################################################
zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
# The saxpy/daxpy/caxpy/zaxpy link rules are defined once, in the axpy section that precedes the rot rules; the identical second copy that stood here only made make override the recipes and warn about it.
##################################### Sasum ####################################################
sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Dasum ####################################################
dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Casum ####################################################
casum.goto : casum.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Zasum ####################################################
zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Scopy ####################################################
scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Dcopy ####################################################
dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Ccopy ####################################################
ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
##################################### Zcopy ####################################################
zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
###################################################################################################
sdot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
ddot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
cdot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zdot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
dsdot.$(SUFFIX) : dsdot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
isamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
idamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
icamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
izamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
samax.$(SUFFIX) : amax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
damax.$(SUFFIX) : amax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
ismax.$(SUFFIX) : imax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
idmax.$(SUFFIX) : imax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
smax.$(SUFFIX) : max.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dmax.$(SUFFIX) : max.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
isamin.$(SUFFIX) : iamin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
idamin.$(SUFFIX) : iamin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
icamin.$(SUFFIX) : iamin.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
izamin.$(SUFFIX) : iamin.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
samin.$(SUFFIX) : amin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
damin.$(SUFFIX) : amin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
camin.$(SUFFIX) : amin.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zamin.$(SUFFIX) : amin.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
ismin.$(SUFFIX) : imin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
idmin.$(SUFFIX) : imin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
smin.$(SUFFIX) : min.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dmin.$(SUFFIX) : min.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
sgemv.$(SUFFIX) : gemv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dgemv.$(SUFFIX) : gemv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
cgemv.$(SUFFIX) : gemv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zgemv.$(SUFFIX) : gemv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
sscal.$(SUFFIX) : scal.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dscal.$(SUFFIX) : scal.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
cscal.$(SUFFIX) : scal.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zscal.$(SUFFIX) : scal.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
saxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
daxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
caxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zaxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
srot.$(SUFFIX) : rot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
drot.$(SUFFIX) : rot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
crot.$(SUFFIX) : rot.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zrot.$(SUFFIX) : rot.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
sswap.$(SUFFIX) : swap.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dswap.$(SUFFIX) : swap.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
cswap.$(SUFFIX) : swap.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zswap.$(SUFFIX) : swap.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
# The {s,d,c,z}axpy.$(SUFFIX) object rules are defined once, right after the scal object rules; the identical second copy that stood here only made make override the recipes and warn about it.
sasum.$(SUFFIX) : asum.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dasum.$(SUFFIX) : asum.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
casum.$(SUFFIX) : asum.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zasum.$(SUFFIX) : asum.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
scopy.$(SUFFIX) : copy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dcopy.$(SUFFIX) : copy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
ccopy.$(SUFFIX) : copy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zcopy.$(SUFFIX) : copy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
clean ::
@rm -f *.goto

235
ztest/amax.c Normal file
View File

@ -0,0 +1,235 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Tolerance comparison used to check a kernel result against the reference.
 *
 *   exp  - expected value
 *   real - value actually produced
 *   tol  - largest absolute difference still accepted
 *
 * Returns 1 when the two values are equal or differ by at most tol, and 0
 * otherwise. A NaN on either side is never "near" anything, so a kernel that
 * produces NaN is reported as a mismatch instead of slipping through.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double diff;

    /* Exact matches first: this also covers equal infinities, whose
       difference would be NaN. */
    if (exp == real) {
        return 1;
    }

    diff = exp - real;
    /* avoid using fabs and linking with a math lib */
    if (diff < 0) {
        diff = -diff;
    }

    /* Phrased as "<=" so that a NaN difference compares false and fails. */
    return (diff <= tol) ? 1 : 0;
}
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Plain C reference for the AMAX kernel.
 *
 * Scans the n elements x[0], x[inc_x], x[2*inc_x], ... and returns the
 * largest absolute value found.  Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  BLASLONG k, pos;
  FLOAT best = 0.0;
  FLOAT cur;

  if (n <= 0 || inc_x <= 0) return(best);

  /* seed with the first element, then visit the remaining n-1 */
  best = ABS(x[0]);
  for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
    cur = ABS(x[pos]);
    if (cur > best) {
      best = cur;
    }
  }
  return(best);
}
#undef AMAX
#ifdef DOUBLE
#define AMAX BLASFUNC(damax)
#else
#define AMAX BLASFUNC(samax)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
/*
 * malloc() substitute backed by a SysV shared-memory segment requested with
 * SHM_HUGETLB.  The request is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  On any failure a message is printed and the process exits
 * with status 1, so the function never returns NULL.
 * Currently compiled out by the "&& 0" in the enclosing #if.
 */
static void *huge_malloc(BLASLONG size){
  void *address;
  int shmid = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);

  if (shmid < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  address = shmat(shmid, NULL, SHM_RND);
  if ((BLASLONG)address == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* the id is no longer needed once the segment is attached */
  shmctl(shmid, IPC_RMID, 0);
  return address;
}
#define malloc huge_malloc
#endif
/*
 * Test/benchmark driver for AMAX.
 *
 * Usage: prog [from [to [step]]]
 *   from, to, step - range of vector lengths (defaults 1, 200, 1);
 *                    `to` is raised to `from` when it is smaller.
 * Environment:
 *   OPENBLAS_LOOPS - repetitions per length (default 1)
 *   OPENBLAS_INCX  - stride of x (default 1)
 *
 * For each length the vector is filled with values in [-0.5, 0.5], the
 * library routine and the C reference amax_c() are timed, and both results
 * are compared with assert_dbl_near().  One report line per length is
 * written to stderr.  Always returns 0.
 */
int main(int argc, char *argv[]){
  FLOAT *x;
  FLOAT result, result_c;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg,timeg_c;
  /* never reset: once one comparison fails, every later length reports FAILD */
  int test = 1;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* A step <= 0 would never move m past `to`; loops <= 0 would skip every
     measurement and average the timings by zero.  Fall back to 1. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* blasint is cast like m below, so the argument matches %d */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  /* NOTE(review): this seeds random(), while the fill loop draws from rand() */
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    timeg_c=0;
    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* library routine */
      gettimeofday( &start, (struct timezone *)0);
      result = AMAX (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* C reference */
      gettimeofday( &start, (struct timezone *)0);
      result_c = amax_c(m, x, inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      test &= assert_dbl_near(result, result_c, SINGLE_EPS);
    }

    timeg /= loops;
    timeg_c /= loops;
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
  }
  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

235
ztest/amin.c Normal file
View File

@ -0,0 +1,235 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Compare two doubles using an absolute tolerance.
 *
 *   exp  - expected (reference) value
 *   real - value under test
 *   tol  - largest absolute difference that is still accepted
 *
 * Returns 1 when the values are equal or |exp - real| <= tol, 0 otherwise.
 * A NaN operand always yields 0: every ordered comparison involving NaN is
 * false, so the acceptance test is written positively instead of as
 * "reject when the difference is greater than tol".
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* exact match, including two infinities of the same sign */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  return (absdiff <= tol) ? 1 : 0;
}
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Plain C reference for the AMIN kernel.
 *
 * Scans the n elements x[0], x[inc_x], x[2*inc_x], ... and returns the
 * smallest absolute value found.  Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  BLASLONG k, pos;
  FLOAT lowest = 0.0;
  FLOAT cur;

  if (n <= 0 || inc_x <= 0) return(lowest);

  /* seed with the first element, then visit the remaining n-1 */
  lowest = ABS(x[0]);
  for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
    cur = ABS(x[pos]);
    if (cur < lowest) {
      lowest = cur;
    }
  }
  return(lowest);
}
#undef AMIN
#ifdef DOUBLE
#define AMIN BLASFUNC(damin)
#else
#define AMIN BLASFUNC(samin)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
/*
 * malloc() substitute backed by a SysV shared-memory segment requested with
 * SHM_HUGETLB.  The request is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  On any failure a message is printed and the process exits
 * with status 1, so the function never returns NULL.
 * Currently compiled out by the "&& 0" in the enclosing #if.
 */
static void *huge_malloc(BLASLONG size){
  void *address;
  int shmid = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);

  if (shmid < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  address = shmat(shmid, NULL, SHM_RND);
  if ((BLASLONG)address == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* the id is no longer needed once the segment is attached */
  shmctl(shmid, IPC_RMID, 0);
  return address;
}
#define malloc huge_malloc
#endif
/*
 * Test/benchmark driver for AMIN.
 *
 * Usage: prog [from [to [step]]]
 *   from, to, step - range of vector lengths (defaults 1, 200, 1);
 *                    `to` is raised to `from` when it is smaller.
 * Environment:
 *   OPENBLAS_LOOPS - repetitions per length (default 1)
 *   OPENBLAS_INCX  - stride of x (default 1)
 *
 * For each length the vector is filled with values in [-0.5, 0.5], the
 * library routine and the C reference amin_c() are timed, and both results
 * are compared with assert_dbl_near().  One report line per length is
 * written to stderr.  Always returns 0.
 */
int main(int argc, char *argv[]){
  FLOAT *x;
  FLOAT result, result_c;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg,timeg_c;
  /* never reset: once one comparison fails, every later length reports FAILD */
  int test = 1;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* A step <= 0 would never move m past `to`; loops <= 0 would skip every
     measurement and average the timings by zero.  Fall back to 1. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* blasint is cast like m below, so the argument matches %d */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  /* NOTE(review): this seeds random(), while the fill loop draws from rand() */
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    timeg_c=0;
    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* library routine */
      gettimeofday( &start, (struct timezone *)0);
      result = AMIN (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* C reference */
      gettimeofday( &start, (struct timezone *)0);
      result_c = amin_c(m, x, inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      test &= assert_dbl_near(result, result_c, SINGLE_EPS);
    }

    timeg /= loops;
    timeg_c /= loops;
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
  }
  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

263
ztest/asum.c Normal file
View File

@ -0,0 +1,263 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Compare two doubles using an absolute tolerance.
 *
 *   exp  - expected (reference) value
 *   real - value under test
 *   tol  - largest absolute difference that is still accepted
 *
 * Returns 1 when the values are equal or |exp - real| <= tol, 0 otherwise.
 * A NaN operand always yields 0: every ordered comparison involving NaN is
 * false, so the acceptance test is written positively instead of as
 * "reject when the difference is greater than tol".
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* exact match, including two infinities of the same sign */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  return (absdiff <= tol) ? 1 : 0;
}
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#ifdef COMPLEX
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
/*
 * Plain C reference for the complex ASUM kernel.
 *
 * x holds interleaved (re, im) pairs; consecutive elements are 2*inc_x
 * FLOATs apart.  Returns the sum of |re| + |im| over n elements, or 0.0
 * when n <= 0 or inc_x <= 0.
 */
FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  FLOAT total = 0.0;
  BLASLONG stride, limit, pos;

  if (n <= 0 || inc_x <= 0) return(total);

  stride = 2 * inc_x;
  limit = n * stride;   /* one past the last FLOAT index to visit */
  for (pos = 0; pos < limit; pos += stride) {
    total += CABS1(x,pos);
  }
  return(total);
}
#else
/*
 * Plain C reference for the real ASUM kernel.
 *
 * Returns the sum of the absolute values of the n elements x[0], x[inc_x],
 * x[2*inc_x], ...; 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  FLOAT total = 0.0;
  BLASLONG limit, pos;

  if (n <= 0 || inc_x <= 0) return(total);

  limit = n * inc_x;    /* one past the last index to visit */
  for (pos = 0; pos < limit; pos += inc_x) {
    total += ABS(x[pos]);
  }
  return(total);
}
#endif
#undef ASUM
#ifdef COMPLEX
#ifdef DOUBLE
#define ASUM BLASFUNC(dzasum)
#else
#define ASUM BLASFUNC(scasum)
#endif
#else
#ifdef DOUBLE
#define ASUM BLASFUNC(dasum)
#else
#define ASUM BLASFUNC(sasum)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
/*
 * malloc() substitute backed by a SysV shared-memory segment requested with
 * SHM_HUGETLB.  The request is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  On any failure a message is printed and the process exits
 * with status 1, so the function never returns NULL.
 * Currently compiled out by the "&& 0" in the enclosing #if.
 */
static void *huge_malloc(BLASLONG size){
  void *address;
  int shmid = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);

  if (shmid < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  address = shmat(shmid, NULL, SHM_RND);
  if ((BLASLONG)address == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* the id is no longer needed once the segment is attached */
  shmctl(shmid, IPC_RMID, 0);
  return address;
}
#define malloc huge_malloc
#endif
/*
 * Test/benchmark driver for ASUM (real or complex, selected by COMPLEX).
 *
 * Usage: prog [from [to [step]]]
 *   from, to, step - range of vector lengths (defaults 1, 200, 1);
 *                    `to` is raised to `from` when it is smaller.
 * Environment:
 *   OPENBLAS_LOOPS - repetitions per length (default 1)
 *   OPENBLAS_INCX  - stride of x (default 1)
 *
 * For each length the vector is filled with values in [-0.5, 0.5], the
 * library routine and the C reference (zasum_c or asum_c) are timed, and
 * both results are compared with assert_dbl_near().  One report line per
 * length is written to stderr.  Always returns 0.
 */
int main(int argc, char *argv[]){
  FLOAT *x;
  FLOAT result, result_c;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg,timeg_c;
  /* never reset: once one comparison fails, every later length reports FAILD */
  int test = 1;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* A step <= 0 would never move m past `to`; loops <= 0 would skip every
     measurement and average the timings by zero.  Fall back to 1. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* blasint is cast like m below, so the argument matches %d */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  /* NOTE(review): this seeds random(), while the fill loop draws from rand() */
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    timeg_c=0;
    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* library routine */
      gettimeofday( &start, (struct timezone *)0);
      result = ASUM (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* C reference */
      gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
      result_c = zasum_c(m, x, inc_x);
#else
      result_c = asum_c(m, x, inc_x);
#endif
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      test &= assert_dbl_near(result, result_c, SINGLE_EPS);
    }

    timeg /= loops;
    timeg_c /= loops;
#ifdef COMPLEX
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 4. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#else
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#endif
  }
  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

303
ztest/axpy.c Normal file
View File

@ -0,0 +1,303 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Compare two doubles using an absolute tolerance.
 *
 *   exp  - expected (reference) value
 *   real - value under test
 *   tol  - largest absolute difference that is still accepted
 *
 * Returns 1 when the values are equal or |exp - real| <= tol, 0 otherwise.
 * A NaN operand always yields 0: every ordered comparison involving NaN is
 * false, so the acceptance test is written positively instead of as
 * "reject when the difference is greater than tol".
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* exact match, including two infinities of the same sign */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  return (absdiff <= tol) ? 1 : 0;
}
#ifdef COMPLEX
/*
 * Plain C reference for the complex AXPY kernel.
 *
 * x and y hold interleaved (re, im) pairs; consecutive elements are
 * 2*inc_x / 2*inc_y FLOATs apart.  For n elements it performs
 *   y += (da_r + i*da_i) * x          when CONJ is not defined
 *   y += (da_r + i*da_i) * conj(x)    when CONJ is defined
 * Nothing is written when n < 0 or when both da_r and da_i are zero.
 * dummy0, dummy1, dummy and dummy2 are not used.  Always returns 0.
 */
int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
  BLASLONG k;
  BLASLONG px = 0, py = 0;
  BLASLONG step_x, step_y;

  if ( n < 0 ) return(0);
  if ( da_r == 0.0 && da_i == 0.0 ) return(0);

  step_x = 2 * inc_x;
  step_y = 2 * inc_y;

  for (k = 0; k < n; k++)
  {
#if !defined(CONJ)
    y[py]   += ( da_r * x[px]   - da_i * x[px+1] ) ;
    y[py+1] += ( da_r * x[px+1] + da_i * x[px]   ) ;
#else
    y[py]   += ( da_r * x[px]   + da_i * x[px+1] ) ;
    y[py+1] -= ( da_r * x[px+1] - da_i * x[px]   ) ;
#endif
    px += step_x ;
    py += step_y ;
  }
  return(0);
}
#else
/*
 * Plain C reference for the real AXPY kernel: y[k*inc_y] += da * x[k*inc_x]
 * for k = 0 .. n-1.  Nothing is written when n < 0 or da == 0.0.
 * dummy0, dummy1, dummy and dummy2 are not used.  Always returns 0.
 */
int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
  BLASLONG k;
  BLASLONG px = 0, py = 0;

  if ( n < 0 ) return(0);
  if ( da == 0.0 ) return(0);

  for (k = 0; k < n; k++)
  {
    y[py] += da * x[px] ;
    px += inc_x ;
    py += inc_y ;
  }
  return(0);
}
#endif
#undef AXPY
#ifdef COMPLEX
#ifdef DOUBLE
#define AXPY BLASFUNC(zaxpy)
#else
#define AXPY BLASFUNC(caxpy)
#endif
#else
#ifdef DOUBLE
#define AXPY BLASFUNC(daxpy)
#else
#define AXPY BLASFUNC(saxpy)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
/*
 * malloc() substitute backed by a SysV shared-memory segment requested with
 * SHM_HUGETLB.  The request is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  On any failure a message is printed and the process exits
 * with status 1, so the function never returns NULL.
 * Currently compiled out by the "&& 0" in the enclosing #if.
 */
static void *huge_malloc(BLASLONG size){
  void *address;
  int shmid = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);

  if (shmid < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  address = shmat(shmid, NULL, SHM_RND);
  if ((BLASLONG)address == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* the id is no longer needed once the segment is attached */
  shmctl(shmid, IPC_RMID, 0);
  return address;
}
#define malloc huge_malloc
#endif
/*
 * Test/benchmark driver for AXPY (real or complex, selected by COMPLEX).
 *
 * Usage: prog [from [to [step]]]
 *   from, to, step - range of vector lengths (defaults 1, 200, 1);
 *                    `to` is raised to `from` when it is smaller.
 * Environment:
 *   OPENBLAS_LOOPS - repetitions per length (default 1)
 *   OPENBLAS_INCX  - stride of x (default 1)
 *   OPENBLAS_INCY  - stride of y (default 1)
 *
 * For each length x and y are filled with values in [-0.5, 0.5] and y is
 * duplicated into y_c.  The library routine updates y, the C reference
 * (zaxpy_c or axpy_c) updates y_c, both are timed, and the two buffers are
 * compared with assert_dbl_near().  One report line per length is written
 * to stderr.  Always returns 0.
 *
 * NOTE(review): the reference helpers index from element 0 upwards, so a
 * negative OPENBLAS_INCX/OPENBLAS_INCY is not a supported configuration.
 */
int main(int argc, char *argv[]){
  FLOAT *x, *y, *y_c;
  FLOAT alpha[2] = { 2.0, 2.0 };
  blasint m, i;
  blasint inc_x=1,inc_y=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg,timeg_c;
  /* never reset: once one comparison fails, every later length reports FAILD */
  int test = 1;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

  /* A step <= 0 would never move m past `to`; loops <= 0 would skip every
     measurement and average the timings by zero.  Fall back to 1. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* blasint is cast like m below, so the arguments match %d */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,(int)inc_x,(int)inc_y,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
  if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  /* NOTE(review): this seeds random(), while the fill loops draw from rand() */
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    timeg_c=0;
    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }
      for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
        y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        y_c[i] = y[i];
      }

      /* library routine */
      gettimeofday( &start, (struct timezone *)0);
      AXPY (&m, alpha, x, &inc_x, y, &inc_y );
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* C reference */
      gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
      zaxpy_c(m, 0, 0, alpha[0], alpha[1], x, inc_x, y_c, inc_y, NULL, 0);
#else
      axpy_c(m, 0, 0, *alpha, x, inc_x, y_c, inc_y, NULL, 0);
#endif
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      /* Compare the complete buffers.  Stepping one index by inc_y skipped
         every imaginary part of a strided complex vector and indexed before
         the buffers for a negative inc_y.  Both buffers start out identical,
         so positions neither routine writes must still match. */
      for (i = 0; i < m * COMPSIZE * abs(inc_y); i++)
      {
        test &= assert_dbl_near(y[i], y_c[i], SINGLE_EPS);
      }
    }

    timeg /= loops;
    timeg_c /= loops;
#ifdef COMPLEX
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#else
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#endif
  }
  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

291
ztest/copy.c Normal file
View File

@ -0,0 +1,291 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Compare two doubles using an absolute tolerance.
 *
 *   exp  - expected (reference) value
 *   real - value under test
 *   tol  - largest absolute difference that is still accepted
 *
 * Returns 1 when the values are equal or |exp - real| <= tol, 0 otherwise.
 * A NaN operand always yields 0: every ordered comparison involving NaN is
 * false, so the acceptance test is written positively instead of as
 * "reject when the difference is greater than tol".
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* exact match, including two infinities of the same sign */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  return (absdiff <= tol) ? 1 : 0;
}
#ifdef COMPLEX
/*
 * Plain C reference for the complex COPY kernel.
 *
 * x and y hold interleaved (re, im) pairs; consecutive elements are
 * 2*inc_x / 2*inc_y FLOATs apart.  Copies n elements from x to y.
 * Nothing is written when n < 0.  Always returns 0.
 */
int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
  BLASLONG k;
  BLASLONG src = 0, dst = 0;
  BLASLONG step_x, step_y;

  if ( n < 0 ) return(0);

  step_x = 2 * inc_x;
  step_y = 2 * inc_y;

  for (k = 0; k < n; k++)
  {
    y[dst]   = x[src] ;      /* real part */
    y[dst+1] = x[src+1] ;    /* imaginary part */
    src += step_x;
    dst += step_y;
  }
  return(0);
}
#else
/*
 * Plain C reference for the real COPY kernel: y[k*inc_y] = x[k*inc_x] for
 * k = 0 .. n-1.  Nothing is written when n < 0.  Always returns 0.
 */
int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
  BLASLONG k;
  BLASLONG src = 0, dst = 0;

  if ( n < 0 ) return(0);

  for (k = 0; k < n; k++)
  {
    y[dst] = x[src] ;
    src += inc_x ;
    dst += inc_y ;
  }
  return(0);
}
#endif
#undef COPY
#ifdef COMPLEX
#ifdef DOUBLE
#define COPY BLASFUNC(zcopy)
#else
#define COPY BLASFUNC(ccopy)
#endif
#else
#ifdef DOUBLE
#define COPY BLASFUNC(dcopy)
#else
#define COPY BLASFUNC(scopy)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
/*
 * malloc() substitute backed by a SysV shared-memory segment requested with
 * SHM_HUGETLB.  The request is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  On any failure a message is printed and the process exits
 * with status 1, so the function never returns NULL.
 * Currently compiled out by the "&& 0" in the enclosing #if.
 */
static void *huge_malloc(BLASLONG size){
  void *address;
  int shmid = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);

  if (shmid < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  address = shmat(shmid, NULL, SHM_RND);
  if ((BLASLONG)address == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* the id is no longer needed once the segment is attached */
  shmctl(shmid, IPC_RMID, 0);
  return address;
}
#define malloc huge_malloc
#endif
/*
 * Test/benchmark driver for COPY (real or complex, selected by COMPLEX).
 *
 * Usage: prog [from [to [step]]]
 *   from, to, step - range of vector lengths (defaults 1, 200, 1);
 *                    `to` is raised to `from` when it is smaller.
 * Environment:
 *   OPENBLAS_LOOPS - repetitions per length (default 1)
 *   OPENBLAS_INCX  - stride of x (default 1)
 *   OPENBLAS_INCY  - stride of y (default 1)
 *
 * For each length x and y are filled with values in [-0.5, 0.5] and y is
 * duplicated into y_c.  The library routine copies x into y, the C
 * reference (zcopy_c or copy_c) copies x into y_c, both are timed, and the
 * two buffers are compared with assert_dbl_near().  One report line per
 * length is written to stderr.  Always returns 0.
 *
 * NOTE(review): the reference helpers index from element 0 upwards, so a
 * negative OPENBLAS_INCX/OPENBLAS_INCY is not a supported configuration.
 */
int main(int argc, char *argv[]){
  FLOAT *x, *y, *y_c;
  blasint m, i;
  blasint inc_x=1,inc_y=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg,timeg_c;
  /* never reset: once one comparison fails, every later length reports FAILD */
  int test = 1;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

  /* A step <= 0 would never move m past `to`; loops <= 0 would skip every
     measurement and average the timings by zero.  Fall back to 1. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* blasint is cast like m below, so the arguments match %d */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,(int)inc_x,(int)inc_y,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
  if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  /* NOTE(review): this seeds random(), while the fill loops draw from rand() */
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    timeg_c=0;
    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }
      for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
        y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        y_c[i] = y[i];
      }

      /* library routine */
      gettimeofday( &start, (struct timezone *)0);
      COPY (&m, x, &inc_x, y, &inc_y );
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* C reference */
      gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
      zcopy_c(m, x, inc_x, y_c, inc_y);
#else
      copy_c(m, x, inc_x, y_c, inc_y);
#endif
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      /* Compare the complete buffers.  Stepping one index by inc_y skipped
         every imaginary part of a strided complex vector and indexed before
         the buffers for a negative inc_y.  Both buffers start out identical,
         so positions neither routine writes must still match. */
      for (i = 0; i < m * COMPSIZE * abs(inc_y); i++)
      {
        test &= assert_dbl_near(y[i], y_c[i], SINGLE_EPS);
      }
    }

    timeg /= loops;
    timeg_c /= loops;
#ifdef COMPLEX
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#else
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#endif
  }
  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

296
ztest/dot.c Normal file
View File

@ -0,0 +1,296 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Compare two doubles using an absolute tolerance.
 *
 *   exp  - expected (reference) value
 *   real - value under test
 *   tol  - largest absolute difference that is still accepted
 *
 * Returns 1 when the values are equal or |exp - real| <= tol, 0 otherwise.
 * A NaN operand always yields 0: every ordered comparison involving NaN is
 * false, so the acceptance test is written positively instead of as
 * "reject when the difference is greater than tol".
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* exact match, including two infinities of the same sign */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  return (absdiff <= tol) ? 1 : 0;
}
#ifdef COMPLEX
/*
 * Plain C reference for the complex DOT kernel.
 *
 * x and y hold interleaved (re, im) pairs; consecutive elements are
 * 2*inc_x / 2*inc_y FLOATs apart.  Over n elements it accumulates
 *   x[k] * y[k]          when CONJ is not defined
 *   conj(x[k]) * y[k]    when CONJ is defined
 * and returns the sum.  Returns zero when n < 1.
 */
OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
  OPENBLAS_COMPLEX_FLOAT result;
  FLOAT acc_re = 0.0;
  FLOAT acc_im = 0.0;
  BLASLONG k;
  BLASLONG px = 0, py = 0;
  BLASLONG step_x, step_y;

  CREAL(result) = 0.0 ;
  CIMAG(result) = 0.0 ;
  if ( n < 1 ) return(result);

  step_x = 2 * inc_x ;
  step_y = 2 * inc_y ;

  for (k = 0; k < n; k++)
  {
#if !defined(CONJ)
    acc_re += ( x[px]   * y[py] - x[px+1] * y[py+1] ) ;
    acc_im += ( x[px+1] * y[py] + x[px]   * y[py+1] ) ;
#else
    acc_re += ( x[px]   * y[py] + x[px+1] * y[py+1] ) ;
    acc_im -= ( x[px+1] * y[py] - x[px]   * y[py+1] ) ;
#endif
    px += step_x ;
    py += step_y ;
  }

  CREAL(result) = acc_re;
  CIMAG(result) = acc_im;
  return(result);
}
#else
/*
 * Plain C reference for the real dot product: the sum of y[k*inc_y] *
 * x[k*inc_x] for k = 0 .. n-1, accumulated in FLOAT precision.
 * Returns 0.0 when n is zero or negative (the loop body never runs).
 */
FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    FLOAT sum = 0.0 ;
    BLASLONG px = 0;
    BLASLONG py = 0;
    BLASLONG k;

    for (k = 0; k < n; k++)
    {
        sum += y[py] * x[px] ;
        px += inc_x ;
        py += inc_y ;
    }
    return(sum);
}
#endif
/*
 * DOT names the BLAS routine under test, chosen from the COMPLEX / DOUBLE
 * build flags: zdotu or cdotu for complex builds (the unconjugated
 * variants), ddot or sdot for real builds.
 * NOTE(review): BLASFUNC is defined outside this file; presumably it applies
 * the library's symbol decoration -- confirm in common.h.
 */
#undef DOT
#ifdef COMPLEX
#ifdef DOUBLE
#define DOT BLASFUNC(zdotu)
#else
#define DOT BLASFUNC(cdotu)
#endif
#else
#ifdef DOUBLE
#define DOT BLASFUNC(ddot)
#else
#define DOT BLASFUNC(sdot)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * malloc() stand-in that obtains memory from a System V shared memory
 * segment created with SHM_HUGETLB.
 *
 * The requested size is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  The segment is marked for removal immediately after it
 * has been attached.  Any failure prints a message and terminates the
 * process, so the function never returns NULL.
 * (The enclosing #if ends in "&& 0", so this is currently compiled out.)
 */
static void *huge_malloc(BLASLONG size){
    int segment;
    void *mapping;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapping = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapping == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    /* Flag the segment for deletion now that it is attached. */
    shmctl(segment, IPC_RMID, 0);
    return mapping;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
#ifdef COMPLEX
OPENBLAS_COMPLEX_FLOAT result, result_c;
#else
FLOAT result, result_c;
#endif
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg,timeg_c;
int test = 1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops Time CTime Test\n");
for(m = from; m <= to; m += step)
{
timeg=0;
timeg_c=0;
fprintf(stderr, " %6d :", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
result = DOT(&m, x, &inc_x, y, &inc_y);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
result_c = zdot_c(m, x, inc_x, y, inc_y);
#else
result_c = dot_c(m, x, inc_x, y, inc_y);
#endif
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg_c += time1;
#ifdef COMPLEX
test &= assert_dbl_near(CREAL(result), CREAL(result_c), SINGLE_EPS);
test &= assert_dbl_near(CIMAG(result), CIMAG(result_c), SINGLE_EPS);
#else
test &= assert_dbl_near(result, result_c, SINGLE_EPS);
#endif
}
timeg /= loops;
timeg_c /= loops;
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

229
ztest/dsdot.c Normal file
View File

@ -0,0 +1,229 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Absolute-tolerance comparison used to check a BLAS result against the
 * reference implementation.
 *
 *   exp  - expected value
 *   real - value actually obtained
 *   tol  - largest accepted absolute difference (expected to be >= 0)
 *
 * Returns 1 when the two values are equal or differ by at most tol, and 0
 * otherwise.  A NaN operand never compares as "near": the previous
 * "absdiff > tol" test was false for NaN, so a kernel that produced NaN
 * was reported as passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff;

    /* Identical values (this includes infinities of the same sign, whose
       difference would be NaN) are trivially near. */
    if (exp == real) {
        return 1;
    }

    absdiff = exp - real;
    /* avoid using fabs and linking with a math lib */
    if (absdiff < 0) {
        absdiff = -absdiff;
    }

    /* "<=" is false for NaN, so NaN differences are rejected. */
    return (absdiff <= tol) ? 1 : 0;
}
/*
 * Plain C reference for DSDOT: the dot product of two FLOAT vectors with
 * both the products and the running sum carried in double precision.
 *
 * Each operand is widened to double before it is multiplied.  Previously
 * the product was formed in FLOAT precision and only then added to the
 * double accumulator, so the reference was no more accurate than a
 * single-precision dot product.
 * Returns 0.0 when n is zero or negative.
 */
double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i;
    BLASLONG ix = 0;
    BLASLONG iy = 0;
    double dot = 0.0 ;

    for (i = 0; i < n; i++)
    {
        dot += (double) y[iy] * (double) x[ix] ;
        ix += inc_x ;
        iy += inc_y ;
    }
    return(dot);
}
/* DSDOT names the BLAS routine under test (dsdot).
   NOTE(review): BLASFUNC is defined outside this file; presumably it applies
   the library's symbol decoration -- confirm in common.h. */
#undef DSDOT
#define DSDOT BLASFUNC(dsdot)
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * malloc() stand-in that obtains memory from a System V shared memory
 * segment created with SHM_HUGETLB.
 *
 * The requested size is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  The segment is marked for removal immediately after it
 * has been attached.  Any failure prints a message and terminates the
 * process, so the function never returns NULL.
 * (The enclosing #if ends in "&& 0", so this is currently compiled out.)
 */
static void *huge_malloc(BLASLONG size){
    int segment;
    void *mapping;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapping = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapping == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    /* Flag the segment for deletion now that it is attached. */
    shmctl(segment, IPC_RMID, 0);
    return mapping;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
double result, result_c;
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg,timeg_c;
int test = 1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops Time CTime Test\n");
for(m = from; m <= to; m += step)
{
timeg=0;
timeg_c=0;
fprintf(stderr, " %6d :", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
result = DSDOT(&m, x, &inc_x, y, &inc_y);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
gettimeofday( &start, (struct timezone *)0);
result_c = dsdot_c(m, x, inc_x, y, inc_y);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg_c += time1;
test &= assert_dbl_near(result, result_c, SINGLE_EPS);
}
timeg /= loops;
timeg_c /= loops;
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

618
ztest/gemv.c Normal file
View File

@ -0,0 +1,618 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Absolute-tolerance comparison used to check a BLAS result against the
 * reference implementation.
 *
 *   exp  - expected value
 *   real - value actually obtained
 *   tol  - largest accepted absolute difference (expected to be >= 0)
 *
 * Returns 1 when the two values are equal or differ by at most tol, and 0
 * otherwise.  A NaN operand never compares as "near": the previous
 * "absdiff > tol" test was false for NaN, so a kernel that produced NaN
 * was reported as passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff;

    /* Identical values (this includes infinities of the same sign, whose
       difference would be NaN) are trivially near. */
    if (exp == real) {
        return 1;
    }

    absdiff = exp - real;
    /* avoid using fabs and linking with a math lib */
    if (absdiff < 0) {
        absdiff = -absdiff;
    }

    /* "<=" is false for NaN, so NaN differences are rejected. */
    return (absdiff <= tol) ? 1 : 0;
}
#ifdef COMPLEX
/*
 * Plain C reference for the non-transposed complex GEMV update
 *     y += alpha * A * x
 * (CONJ / XCONJ select the conjugated variants at compile time).
 *
 *   m, n             - rows / columns of A
 *   dummy1           - unused
 *   alpha_r, alpha_i - real and imaginary part of alpha
 *   a, lda           - column-major matrix of interleaved (re, im) pairs and
 *                      its leading dimension in complex elements
 *   x, inc_x         - input vector (n complex elements) and its increment
 *   y, inc_y         - accumulator (m complex elements) and its increment
 *
 * Increments are in complex elements.  y is accumulated in place; no beta
 * scaling is applied.  Always returns 0.
 *
 * Changes from the earlier version: the debug printf that ran for every
 * matrix element in the unit-stride path has been removed (it flooded
 * stdout and skewed the reference timing), and the separate unit-stride
 * branch has been folded into the general loop, which performs exactly the
 * same arithmetic when both increments are 1.
 */
int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i, j;
    BLASLONG ix, iy, i2;
    /* Strides expressed in FLOATs (two per complex element). */
    BLASLONG inc_x2 = 2 * inc_x;
    BLASLONG inc_y2 = 2 * inc_y;
    BLASLONG lda2 = 2 * lda;
    FLOAT *a_ptr = a;
    FLOAT temp_r, temp_i;

    (void)dummy1;

    ix = 0;
    for (j = 0; j < n; j++)
    {
        /* temp = alpha * x[j] (x conjugated when XCONJ is defined). */
#if !defined(XCONJ)
        temp_r = alpha_r * x[ix] - alpha_i * x[ix+1];
        temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
#else
        temp_r = alpha_r * x[ix] + alpha_i * x[ix+1];
        temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
#endif
        iy = 0;
        i2 = 0;
        /* y += temp * (column j of A). */
        for (i = 0; i < m; i++)
        {
#if !defined(CONJ)
#if !defined(XCONJ)
            y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
            y[iy+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
#else
            y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
            y[iy+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
#endif
#else
#if !defined(XCONJ)
            y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
            y[iy+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
#else
            y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
            y[iy+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
#endif
#endif
            i2 += 2;
            iy += inc_y2;
        }
        a_ptr += lda2;
        ix += inc_x2;
    }
    return(0);
}
/*
 * Plain C reference for the transposed complex GEMV update
 *     y[j] += alpha * (column j of A) . x      for j = 0 .. n-1
 * (CONJ / XCONJ select the conjugated variants at compile time).
 *
 * a is column-major with interleaved (re, im) pairs and leading dimension
 * lda (complex elements); x has m and y has n complex elements; increments
 * are in complex elements.  dummy1 is unused.  y is accumulated in place
 * and the function always returns 0.
 *
 * A single loop nest serves every stride: with both increments equal to 1
 * the FLOAT strides below are 2, which is what a dedicated unit-stride
 * path would use.
 */
int zgemv_t_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    const BLASLONG col_stride = 2 * lda;
    const BLASLONG step_x = 2 * inc_x;
    const BLASLONG step_y = 2 * inc_y;
    FLOAT *col = a;
    FLOAT acc_r, acc_i;
    BLASLONG c, r, k, px;
    BLASLONG py = 0;

    for (c = 0; c < n; c++)
    {
        /* Dot product of column c with x. */
        acc_r = 0.0;
        acc_i = 0.0;
        px = 0;
        for (r = 0, k = 0; r < m; r++, k += 2)
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            acc_r += col[k] * x[px] - col[k+1] * x[px+1];
            acc_i += col[k] * x[px+1] + col[k+1] * x[px];
#else
            acc_r += col[k] * x[px] + col[k+1] * x[px+1];
            acc_i += col[k] * x[px+1] - col[k+1] * x[px];
#endif
            px += step_x;
        }

        /* Scale by alpha and accumulate into y[c]. */
#if !defined(XCONJ)
        y[py] += alpha_r * acc_r - alpha_i * acc_i;
        y[py+1] += alpha_r * acc_i + alpha_i * acc_r;
#else
        y[py] += alpha_r * acc_r + alpha_i * acc_i;
        y[py+1] -= alpha_r * acc_i - alpha_i * acc_r;
#endif
        col += col_stride;
        py += step_y;
    }
    return(0);
}
#else
/*
 * Plain C reference for the non-transposed real GEMV update
 *     y += alpha * A * x
 * A is column-major (m rows, n columns, leading dimension lda); x has n
 * and y has m logical elements spaced by inc_x / inc_y.  dummy1 is unused.
 * y is accumulated in place; always returns 0.
 */
int gemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    FLOAT *col = a;
    FLOAT scaled;
    BLASLONG px = 0;
    BLASLONG py;
    BLASLONG c, r;

    for (c = 0; c < n; c++)
    {
        /* Contribution of column c: y += (alpha * x[c]) * A(:, c). */
        scaled = alpha * x[px];
        for (r = 0, py = 0; r < m; r++, py += inc_y)
        {
            y[py] += scaled * col[r];
        }
        col += lda;
        px += inc_x;
    }
    return(0);
}
/*
 * Plain C reference for the transposed real GEMV update
 *     y[j] += alpha * (column j of A) . x      for j = 0 .. n-1
 * A is column-major (m rows, n columns, leading dimension lda); x has m
 * and y has n logical elements spaced by inc_x / inc_y.  dummy1 is unused.
 * y is accumulated in place; always returns 0.
 */
int gemv_t_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    FLOAT *col = a;
    FLOAT acc;
    BLASLONG py = 0;
    BLASLONG px;
    BLASLONG c, r;

    for (c = 0; c < n; c++)
    {
        /* Dot product of column c with x. */
        acc = 0.0;
        for (r = 0, px = 0; r < m; r++, px += inc_x)
        {
            acc += col[r] * x[px];
        }
        y[py] += alpha * acc;
        py += inc_y;
        col += lda;
    }
    return(0);
}
#endif
/*
 * GEMV names the BLAS routine under test, chosen from the COMPLEX / DOUBLE
 * build flags: dgemv or sgemv for real builds, zgemv or cgemv for complex
 * builds.
 * NOTE(review): BLASFUNC is defined outside this file; presumably it applies
 * the library's symbol decoration -- confirm in common.h.
 */
#undef GEMV
#ifndef COMPLEX
#ifdef DOUBLE
#define GEMV BLASFUNC(dgemv)
#else
#define GEMV BLASFUNC(sgemv)
#endif
#else
#ifdef DOUBLE
#define GEMV BLASFUNC(zgemv)
#else
#define GEMV BLASFUNC(cgemv)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * malloc() stand-in that obtains memory from a System V shared memory
 * segment created with SHM_HUGETLB.
 *
 * The requested size is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  The segment is marked for removal immediately after it
 * has been attached.  Any failure prints a message and terminates the
 * process, so the function never returns NULL.
 * (The enclosing #if ends in "&& 0", so this is currently compiled out.)
 */
static void *huge_malloc(BLASLONG size){
    int segment;
    void *mapping;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapping = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapping == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    /* Flag the segment for deletion now that it is attached. */
    shmctl(segment, IPC_RMID, 0);
    return mapping;
}
#define malloc huge_malloc
#endif
/*
 * Times and checks one m x n GEMV problem.
 *
 * Fills the m x n column-major matrix (leading dimension m) and, for each
 * of `loops` repetitions, fresh x / y vectors with random values in
 * [-0.5, 0.5]; runs GEMV on y and the C reference on the copy y_c; then
 * compares the two output buffers over their whole used span with
 * tolerance SINGLE_EPS.  Comparison results are ANDed into *test, and one
 * result line is printed to stderr.
 *
 * Vector lengths follow the operation: for trans == 'N' x has n and y has
 * m elements, otherwise x has m and y has n.  For a negative increment the
 * reference kernels are handed a pointer to the last logical element,
 * because BLAS walks such vectors from the far end while the reference
 * kernels simply step base + k*inc.
 */
static void gemv_check_case(char trans, blasint m, blasint n,
                            blasint inc_x, blasint inc_y, int loops,
                            FLOAT *alpha, FLOAT *beta,
                            FLOAT *a, FLOAT *x, FLOAT *y, FLOAT *y_c,
                            int *test)
{
    struct timeval start, stop;
    double time1;
    double timeg = 0.0;
    double timeg_c = 0.0;
    blasint len_x = (trans == 'N') ? n : m;
    blasint len_y = (trans == 'N') ? m : n;
    /* Number of FLOATs of each buffer touched by this problem. */
    BLASLONG span_x = (BLASLONG)len_x * COMPSIZE * abs(inc_x);
    BLASLONG span_y = (BLASLONG)len_y * COMPSIZE * abs(inc_y);
    FLOAT *x_ref = x;
    FLOAT *y_ref = y_c;
    BLASLONG i, j;
    int l;

    if (inc_x < 0 && len_x > 1) x_ref -= (BLASLONG)(len_x - 1) * inc_x * COMPSIZE;
    if (inc_y < 0 && len_y > 1) y_ref -= (BLASLONG)(len_y - 1) * inc_y * COMPSIZE;

    fprintf(stderr, " %6dx%d :", (int)m,(int)n);

    /* Column-major fill: n columns of m elements each, lda == m. */
    for (j = 0; j < n; j++) {
        for (i = 0; i < (BLASLONG)m * COMPSIZE; i++) {
            a[i + j * (BLASLONG)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }
    }

    for (l = 0; l < loops; l++)
    {
        for (i = 0; i < span_x; i++) {
            x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }
        for (i = 0; i < span_y; i++) {
            y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
            y_c[i]= y[i];
        }

        gettimeofday( &start, (struct timezone *)0);
        GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
        gettimeofday( &stop, (struct timezone *)0);
        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
        timeg += time1;

        gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
        if (trans == 'N')
            zgemv_n_c(m, n, 0, alpha[0], alpha[1], a, m, x_ref, inc_x, y_ref, inc_y);
        else
            zgemv_t_c(m, n, 0, alpha[0], alpha[1], a, m, x_ref, inc_x, y_ref, inc_y);
#else
        if (trans == 'N')
            gemv_n_c(m, n, 0, *alpha, a, m, x_ref, inc_x, y_ref, inc_y);
        else
            gemv_t_c(m, n, 0, *alpha, a, m, x_ref, inc_x, y_ref, inc_y);
#endif
        gettimeofday( &stop, (struct timezone *)0);
        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
        timeg_c += time1;

        /* Compare the whole span: entries between strided elements were
           copied identically and must not have been modified. */
        for (i = 0; i < span_y; i++) {
            *test &= assert_dbl_near(y[i], y_c[i], SINGLE_EPS);
        }
    }

    timeg /= loops;
    timeg_c /= loops;
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg, timeg_c, *test ? "PASS" : "FAILD");
}

/*
 * Test/benchmark driver for GEMV.
 *
 * Usage: prog [from [to [step]]]
 * Environment: OPENBLAS_LOOPS, OPENBLAS_INCX, OPENBLAS_INCY, OPENBLAS_TRANS,
 * OPENBLAS_PARAM_N (fix n and sweep m), OPENBLAS_PARAM_M (fix m and sweep n;
 * only consulted when no positive OPENBLAS_PARAM_N is given).  With neither
 * set, square problems m == n are swept.
 *
 * The PASS/FAILD column is cumulative across sizes.  Always returns 0
 * unless allocation fails or an increment is zero.
 *
 * Fixes relative to the earlier version:
 *  - beta is 1 + 0i; the reference kernels compute y += alpha*op(A)*x
 *    without beta scaling, so the former complex beta of 1 + 1i could never
 *    agree with them;
 *  - the matrix is filled as n columns of m elements, so every element GEMV
 *    reads is initialized when m != n;
 *  - x / y lengths depend on trans, and negative increments no longer make
 *    the reference kernels or the comparison index outside the buffers;
 *  - the MFlops figure accounts for both matrix dimensions.
 */
int main(int argc, char *argv[]){
    FLOAT *a, *x, *y, *y_c;
    FLOAT alpha[] = {1.0, 1.0};
    FLOAT beta [] = {1.0, 0.0};
    char trans='N';
    blasint m = 0;
    blasint n = 0;
    blasint inc_x=1,inc_y=1;
    int has_param_n = 0;
    int has_param_m = 0;
    int loops = 1;
    char *p;
    int from = 1;
    int to = 200;
    int step = 1;
    int tomax;
    int test = 1;

    argc--;argv++;
    if (argc > 0) { from = atol(*argv); argc--; argv++;}
    if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
    if (argc > 0) { step = atol(*argv); argc--; argv++;}
    tomax = to;

    if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
    if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
    if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
    if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
    if ((p = getenv("OPENBLAS_PARAM_N"))) {
        n = atoi(p);
        if ((n>0)) has_param_n = 1;
        if ( n > tomax ) tomax = n;
    }
    if ( has_param_n == 0 ) {
        if ((p = getenv("OPENBLAS_PARAM_M"))) {
            m = atoi(p);
            if ((m>0)) has_param_m = 1;
            if ( m > tomax ) tomax = m;
        }
    }

    fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,(int)inc_x,(int)inc_y,loops);

    /* GEMV rejects zero increments, and the buffers below would be empty. */
    if (inc_x == 0 || inc_y == 0) {
        fprintf(stderr,"Inc_x and Inc_y must be non-zero\n");exit(1);
    }

    if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }
    if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }
    if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }
    if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, " SIZE Flops Time CTime Test\n");

    if (has_param_m == 0)
    {
        /* Sweep m; n is either fixed by OPENBLAS_PARAM_N or follows m. */
        for(m = from; m <= to; m += step)
        {
            if ( has_param_n == 0 ) n = m;
            gemv_check_case(trans, m, n, inc_x, inc_y, loops, alpha, beta, a, x, y, y_c, &test);
        }
    }
    else
    {
        /* m is fixed by OPENBLAS_PARAM_M; sweep n. */
        for(n = from; n <= to; n += step)
        {
            gemv_check_case(trans, m, n, inc_x, inc_y, loops, alpha, beta, a, x, y, y_c, &test);
        }
    }
    return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

284
ztest/iamax.c Normal file
View File

@ -0,0 +1,284 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Absolute-tolerance comparison used to check a BLAS result against the
 * reference implementation.
 *
 *   exp  - expected value
 *   real - value actually obtained
 *   tol  - largest accepted absolute difference (expected to be >= 0)
 *
 * Returns 1 when the two values are equal or differ by at most tol, and 0
 * otherwise.  A NaN operand never compares as "near": the previous
 * "absdiff > tol" test was false for NaN, so a kernel that produced NaN
 * was reported as passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff;

    /* Identical values (this includes infinities of the same sign, whose
       difference would be NaN) are trivially near. */
    if (exp == real) {
        return 1;
    }

    absdiff = exp - real;
    /* avoid using fabs and linking with a math lib */
    if (absdiff < 0) {
        absdiff = -absdiff;
    }

    /* "<=" is false for NaN, so NaN differences are rejected. */
    return (absdiff <= tol) ? 1 : 0;
}
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#ifdef COMPLEX
/*
 * |re| + |im| of the complex element whose real part is stored at x[i].
 * Arguments and the whole expansion are parenthesized so the macro can be
 * used safely inside any larger expression.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i)+1]))

/*
 * Plain C reference for the complex "index of max |re|+|im|" routine.
 *
 * x holds n interleaved (re, im) pairs spaced inc_x complex elements
 * apart.  Returns the 1-based index of the first element with the largest
 * CABS1 value, or 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    BLASLONG max = 0;
    BLASLONG inc_x2;
    FLOAT maxf;
    FLOAT cur;

    if (n <= 0 || inc_x <= 0) return(max);

    inc_x2 = 2 * inc_x;   /* stride in FLOATs */
    maxf = CABS1(x,0);

    for (i = 1, ix = inc_x2; i < n; i++, ix += inc_x2)
    {
        /* Evaluate the magnitude once per element. */
        cur = CABS1(x,ix);
        /* Strict comparison keeps the first occurrence on ties. */
        if( cur > maxf )
        {
            max = i;
            maxf = cur;
        }
    }
    return(max+1);
}
#else
/*
 * Plain C reference for the real "index of max absolute value" routine.
 *
 * Scans n elements of x spaced inc_x apart and returns the 1-based index
 * of the first element with the largest absolute value, or 0 when n <= 0
 * or inc_x <= 0.
 */
BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;
    BLASLONG pos;
    BLASLONG k;
    FLOAT maxf = 0.0;

    if (n <= 0 || inc_x <= 0) return(best);

    maxf = ABS(x[0]);
    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x)
    {
        /* Strict comparison keeps the first occurrence on ties. */
        if( ABS(x[pos]) > maxf )
        {
            best = k;
            maxf = ABS(x[pos]);
        }
    }
    return(best+1);
}
#endif
/*
 * IAMAX names the BLAS routine under test, chosen from the COMPLEX / DOUBLE
 * build flags: izamax or icamax for complex builds, idamax or isamax for
 * real builds.
 * NOTE(review): BLASFUNC is defined outside this file; presumably it applies
 * the library's symbol decoration -- confirm in common.h.
 */
#undef IAMAX
#ifdef COMPLEX
#ifdef DOUBLE
#define IAMAX BLASFUNC(izamax)
#else
#define IAMAX BLASFUNC(icamax)
#endif
#else
#ifdef DOUBLE
#define IAMAX BLASFUNC(idamax)
#else
#define IAMAX BLASFUNC(isamax)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * malloc() stand-in that obtains memory from a System V shared memory
 * segment created with SHM_HUGETLB.
 *
 * The requested size is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it.  The segment is marked for removal immediately after it
 * has been attached.  Any failure prints a message and terminates the
 * process, so the function never returns NULL.
 * (The enclosing #if ends in "&& 0", so this is currently compiled out.)
 */
static void *huge_malloc(BLASLONG size){
    int segment;
    void *mapping;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapping = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapping == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    /* Flag the segment for deletion now that it is attached. */
    shmctl(segment, IPC_RMID, 0);
    return mapping;
}
#define malloc huge_malloc
#endif
/*
 * Test/benchmark driver for the IAMAX routine.
 *
 * Usage: prog [from [to [step]]]
 * Environment: OPENBLAS_LOOPS (repetitions per size), OPENBLAS_INCX.
 *
 * For every vector length m in [from, to] the vector is filled with random
 * values in [-0.5, 0.5], IAMAX and the C reference are both timed, and the
 * returned indices are compared for equality.  The PASS/FAILD column is
 * cumulative across sizes.  Always returns 0.
 */
int main(int argc, char *argv[]){
#ifdef COMPLEX
    const double flops_per_elem = 6.;
#else
    const double flops_per_elem = 1.;
#endif
    int from = 1;
    int to = 200;
    int step = 1;
    int loops = 1;
    int test = 1;
    int l;
    blasint inc_x = 1;
    blasint m, i;
    FLOAT *x;
    BLASLONG result, result_c;
    struct timeval start, stop;
    double timeg, timeg_c;
    char *p;

    /* Positional arguments: from, to, step (argv[0] is the program name). */
    if (argc > 1) from = atol(argv[1]);
    if (argc > 2) to = MAX(atol(argv[2]), from);
    if (argc > 3) step = atol(argv[3]);

    p = getenv("OPENBLAS_LOOPS");
    if (p) loops = atoi(p);
    p = getenv("OPENBLAS_INCX");
    if (p) inc_x = atoi(p);

    fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);

    x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE);
    if (x == NULL){
        fprintf(stderr,"Out of Memory!!\n");
        exit(1);
    }

#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, " SIZE Flops Time CTime Test\n");

    for (m = from; m <= to; m += step)
    {
        timeg = 0;
        timeg_c = 0;
        fprintf(stderr, " %6d :", (int)m);

        for (l = 0; l < loops; l++)
        {
            for (i = 0; i < m * COMPSIZE * abs(inc_x); i++){
                x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
            }

            /* Routine under test. */
            gettimeofday( &start, (struct timezone *)0);
            result = IAMAX (&m, x, &inc_x);
            gettimeofday( &stop, (struct timezone *)0);
            timeg += (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

            /* C reference. */
            gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
            result_c = izamax_c(m, x, inc_x);
#else
            result_c = iamax_c(m, x, inc_x);
#endif
            gettimeofday( &stop, (struct timezone *)0);
            timeg_c += (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

            test &= (result == result_c);
        }

        timeg /= loops;
        timeg_c /= loops;
        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", flops_per_elem * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
    }
    return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

284
ztest/iamin.c Normal file
View File

@ -0,0 +1,284 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Absolute-tolerance comparison used to check a BLAS result against the
 * reference implementation.
 *
 *   exp  - expected value
 *   real - value actually obtained
 *   tol  - largest accepted absolute difference (expected to be >= 0)
 *
 * Returns 1 when the two values are equal or differ by at most tol, and 0
 * otherwise.  A NaN operand never compares as "near": the previous
 * "absdiff > tol" test was false for NaN, so a kernel that produced NaN
 * was reported as passing.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff;

    /* Identical values (this includes infinities of the same sign, whose
       difference would be NaN) are trivially near. */
    if (exp == real) {
        return 1;
    }

    absdiff = exp - real;
    /* avoid using fabs and linking with a math lib */
    if (absdiff < 0) {
        absdiff = -absdiff;
    }

    /* "<=" is false for NaN, so NaN differences are rejected. */
    return (absdiff <= tol) ? 1 : 0;
}
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#ifdef COMPLEX
/*
 * |re| + |im| of the complex element whose real part is stored at x[i].
 * Arguments and the whole expansion are parenthesized so the macro can be
 * used safely inside any larger expression.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i)+1]))

/*
 * Plain C reference for the complex "index of min |re|+|im|" routine.
 *
 * x holds n interleaved (re, im) pairs spaced inc_x complex elements
 * apart.  Returns the 1-based index of the first element with the smallest
 * CABS1 value, or 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    BLASLONG min = 0;
    BLASLONG inc_x2;
    FLOAT minf;
    FLOAT cur;

    if (n <= 0 || inc_x <= 0) return(min);

    inc_x2 = 2 * inc_x;   /* stride in FLOATs */
    minf = CABS1(x,0);

    for (i = 1, ix = inc_x2; i < n; i++, ix += inc_x2)
    {
        /* Evaluate the magnitude once per element. */
        cur = CABS1(x,ix);
        /* Strict comparison keeps the first occurrence on ties. */
        if( cur < minf )
        {
            min = i;
            minf = cur;
        }
    }
    return(min+1);
}
#else
/*
 * Plain C reference for the real "index of smallest absolute value" search.
 *
 * n     - number of elements
 * x     - input vector
 * inc_x - distance between consecutive elements
 *
 * Returns the 1-based position of the first element whose ABS() value is
 * smallest, or 0 when n or inc_x is not positive.
 */
BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  BLASLONG best = 0;
  BLASLONG pos;
  BLASLONG k;
  FLOAT smallest=0.0;

  if (!(n > 0 && inc_x > 0)) return(best);

  smallest=ABS(x[0]);

  for (k = 1, pos = inc_x; k < n; k++, pos += inc_x)
  {
    /* strict '<' keeps the earliest position on ties */
    if( ABS(x[pos]) < smallest )
    {
      best = k;
      smallest = ABS(x[pos]);
    }
  }
  return(best+1);
}
#endif
#undef IAMIN
#ifdef COMPLEX
#ifdef DOUBLE
#define IAMIN BLASFUNC(izamin)
#else
#define IAMIN BLASFUNC(icamin)
#endif
#else
#ifdef DOUBLE
#define IAMIN BLASFUNC(idamin)
#else
#define IAMIN BLASFUNC(isamin)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() stand-in for native Windows builds.
 *
 * tv - receives the current time as seconds and microseconds relative to the
 *      Unix epoch; may be NULL, in which case nothing is stored
 * tz - ignored
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){
  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz; /* no time zone information is produced */

  if (NULL != tv)
  {
    GetSystemTimeAsFileTime(&ft);
    /* glue the two 32-bit halves into one 64-bit tick count */
    tmpres |= ft.dwHighDateTime;
    tmpres <<= 32;
    tmpres |= ft.dwLowDateTime;
    /*converting file time to unix epoch*/
    tmpres /= 10; /*convert into microseconds*/
    tmpres -= DELTA_EPOCH_IN_MICROSECS;
    tv->tv_sec = (long)(tmpres / 1000000UL);
    tv->tv_usec = (long)(tmpres % 1000000UL);
  }
  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Obtain a buffer from a private System V shared-memory segment requested
 * with SHM_HUGETLB.  The byte count handed to shmget() is
 * (size + HUGE_PAGESIZE) masked with ~(HUGE_PAGESIZE - 1).
 *
 * size - number of bytes wanted
 *
 * Returns the address delivered by shmat().  Prints a message and exits with
 * status 1 when shmget() or shmat() fails.  The segment is marked IPC_RMID
 * directly after attaching.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  void *mapped;
  int segment;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);
  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Test driver: for every vector length from..to (step apart) fill x with
 * random values in [-0.5, 0.5], run the library IAMIN routine and the plain C
 * reference, time both and print one line per length on stderr.
 *
 * Command line:  [from [to [step]]]      (defaults 1, 200, 1)
 * Environment:   OPENBLAS_LOOPS - repetitions per length (default 1)
 *                OPENBLAS_INCX  - stride of x            (default 1)
 *
 * A step or loop count below 1 is raised to 1.
 * Returns 0; exits with status 1 if the work buffer cannot be allocated.
 */
int main(int argc, char *argv[]){
  FLOAT *x;
  BLASLONG result, result_c;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg,timeg_c;
  int test = 1;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* A non-positive step would never let the size loop reach "to", and a
     non-positive repetition count would divide the timings by zero. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* inc_x is a blasint; cast it for %d just like m is cast below */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    timeg_c=0;
    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* library routine */
      gettimeofday( &start, (struct timezone *)0);
      result = IAMIN (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* reference implementation */
      gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
      result_c = izamin_c(m, x, inc_x);
#else
      result_c = iamin_c(m, x, inc_x);
#endif
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      /* test is never reset, so one mismatch marks all later sizes too */
      test &= (result == result_c);
    }

    timeg /= loops;
    timeg_c /= loops;
#ifdef COMPLEX
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#else
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#endif
  }
  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

231
ztest/imax.c Normal file
View File

@ -0,0 +1,231 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Compare two doubles using an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually produced
 * tol  - largest |exp - real| that is still accepted
 *
 * Returns 1 when the values are equal or differ by at most tol, 0 otherwise.
 * A NaN operand (or NaN tolerance) is reported as a mismatch.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* Identical values (including equal infinities) always match. */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  /* Phrased as "not within tolerance" so that a NaN difference fails:
     every ordered comparison involving NaN evaluates to false. */
  if (!(absdiff <= tol)) {
    return 0;
  }
  return 1;
}
/*
 * Plain C reference for the "index of largest value" search (signed
 * comparison, no absolute value).
 *
 * n     - number of elements
 * x     - input vector
 * inc_x - distance between consecutive elements
 *
 * Returns the 1-based position of the first largest element, or 0 when n or
 * inc_x is not positive.
 */
BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  BLASLONG best = 0;
  BLASLONG pos;
  BLASLONG k;
  FLOAT largest=0.0;

  if (!(n > 0 && inc_x > 0)) return(best);

  largest=x[0];

  for (k = 1, pos = inc_x; k < n; k++, pos += inc_x)
  {
    /* strict '>' keeps the earliest position on ties */
    if( x[pos] > largest )
    {
      best = k;
      largest = x[pos];
    }
  }
  return(best+1);
}
#undef IMAX
#ifdef DOUBLE
#define IMAX BLASFUNC(idmax)
#else
#define IMAX BLASFUNC(ismax)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() stand-in for native Windows builds.
 *
 * tv - receives the current time as seconds and microseconds relative to the
 *      Unix epoch; may be NULL, in which case nothing is stored
 * tz - ignored
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){
  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz; /* no time zone information is produced */

  if (NULL != tv)
  {
    GetSystemTimeAsFileTime(&ft);
    /* glue the two 32-bit halves into one 64-bit tick count */
    tmpres |= ft.dwHighDateTime;
    tmpres <<= 32;
    tmpres |= ft.dwLowDateTime;
    /*converting file time to unix epoch*/
    tmpres /= 10; /*convert into microseconds*/
    tmpres -= DELTA_EPOCH_IN_MICROSECS;
    tv->tv_sec = (long)(tmpres / 1000000UL);
    tv->tv_usec = (long)(tmpres % 1000000UL);
  }
  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Obtain a buffer from a private System V shared-memory segment requested
 * with SHM_HUGETLB.  The byte count handed to shmget() is
 * (size + HUGE_PAGESIZE) masked with ~(HUGE_PAGESIZE - 1).
 *
 * size - number of bytes wanted
 *
 * Returns the address delivered by shmat().  Prints a message and exits with
 * status 1 when shmget() or shmat() fails.  The segment is marked IPC_RMID
 * directly after attaching.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  void *mapped;
  int segment;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);
  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Test driver: for every vector length from..to (step apart) fill x with
 * random values in [-0.5, 0.5], run the library IMAX routine and the plain C
 * reference imax_c, time both and print one line per length on stderr.
 *
 * Command line:  [from [to [step]]]      (defaults 1, 200, 1)
 * Environment:   OPENBLAS_LOOPS - repetitions per length (default 1)
 *                OPENBLAS_INCX  - stride of x            (default 1)
 *
 * A step or loop count below 1 is raised to 1.
 * Returns 0; exits with status 1 if the work buffer cannot be allocated.
 */
int main(int argc, char *argv[]){
  FLOAT *x;
  BLASLONG result, result_c;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg,timeg_c;
  int test = 1;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* A non-positive step would never let the size loop reach "to", and a
     non-positive repetition count would divide the timings by zero. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* inc_x is a blasint; cast it for %d just like m is cast below */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    timeg_c=0;
    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* library routine */
      gettimeofday( &start, (struct timezone *)0);
      result = IMAX (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* reference implementation */
      gettimeofday( &start, (struct timezone *)0);
      result_c = imax_c(m, x, inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      /* test is never reset, so one mismatch marks all later sizes too */
      test &= (result == result_c);
    }

    timeg /= loops;
    timeg_c /= loops;
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
  }
  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

231
ztest/imin.c Normal file
View File

@ -0,0 +1,231 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Compare two doubles using an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually produced
 * tol  - largest |exp - real| that is still accepted
 *
 * Returns 1 when the values are equal or differ by at most tol, 0 otherwise.
 * A NaN operand (or NaN tolerance) is reported as a mismatch.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* Identical values (including equal infinities) always match. */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  /* Phrased as "not within tolerance" so that a NaN difference fails:
     every ordered comparison involving NaN evaluates to false. */
  if (!(absdiff <= tol)) {
    return 0;
  }
  return 1;
}
/*
 * Plain C reference for the "index of smallest value" search (signed
 * comparison, no absolute value).
 *
 * n     - number of elements
 * x     - input vector
 * inc_x - distance between consecutive elements
 *
 * Returns the 1-based position of the first smallest element, or 0 when n or
 * inc_x is not positive.
 */
BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  BLASLONG best = 0;
  BLASLONG pos;
  BLASLONG k;
  FLOAT smallest=0.0;

  if (!(n > 0 && inc_x > 0)) return(best);

  smallest=x[0];

  for (k = 1, pos = inc_x; k < n; k++, pos += inc_x)
  {
    /* strict '<' keeps the earliest position on ties */
    if( x[pos] < smallest )
    {
      best = k;
      smallest = x[pos];
    }
  }
  return(best+1);
}
#undef IMIN
#ifdef DOUBLE
#define IMIN BLASFUNC(idmin)
#else
#define IMIN BLASFUNC(ismin)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() stand-in for native Windows builds.
 *
 * tv - receives the current time as seconds and microseconds relative to the
 *      Unix epoch; may be NULL, in which case nothing is stored
 * tz - ignored
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){
  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz; /* no time zone information is produced */

  if (NULL != tv)
  {
    GetSystemTimeAsFileTime(&ft);
    /* glue the two 32-bit halves into one 64-bit tick count */
    tmpres |= ft.dwHighDateTime;
    tmpres <<= 32;
    tmpres |= ft.dwLowDateTime;
    /*converting file time to unix epoch*/
    tmpres /= 10; /*convert into microseconds*/
    tmpres -= DELTA_EPOCH_IN_MICROSECS;
    tv->tv_sec = (long)(tmpres / 1000000UL);
    tv->tv_usec = (long)(tmpres % 1000000UL);
  }
  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Obtain a buffer from a private System V shared-memory segment requested
 * with SHM_HUGETLB.  The byte count handed to shmget() is
 * (size + HUGE_PAGESIZE) masked with ~(HUGE_PAGESIZE - 1).
 *
 * size - number of bytes wanted
 *
 * Returns the address delivered by shmat().  Prints a message and exits with
 * status 1 when shmget() or shmat() fails.  The segment is marked IPC_RMID
 * directly after attaching.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  void *mapped;
  int segment;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);
  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Test driver: for every vector length from..to (step apart) fill x with
 * random values in [-0.5, 0.5], run the library IMIN routine and the plain C
 * reference imin_c, time both and print one line per length on stderr.
 *
 * Command line:  [from [to [step]]]      (defaults 1, 200, 1)
 * Environment:   OPENBLAS_LOOPS - repetitions per length (default 1)
 *                OPENBLAS_INCX  - stride of x            (default 1)
 *
 * A step or loop count below 1 is raised to 1.
 * Returns 0; exits with status 1 if the work buffer cannot be allocated.
 */
int main(int argc, char *argv[]){
  FLOAT *x;
  BLASLONG result, result_c;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg,timeg_c;
  int test = 1;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* A non-positive step would never let the size loop reach "to", and a
     non-positive repetition count would divide the timings by zero. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* inc_x is a blasint; cast it for %d just like m is cast below */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    timeg_c=0;
    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* library routine */
      gettimeofday( &start, (struct timezone *)0);
      result = IMIN (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* reference implementation */
      gettimeofday( &start, (struct timezone *)0);
      result_c = imin_c(m, x, inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      /* test is never reset, so one mismatch marks all later sizes too */
      test &= (result == result_c);
    }

    timeg /= loops;
    timeg_c /= loops;
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
  }
  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

229
ztest/max.c Normal file
View File

@ -0,0 +1,229 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Compare two doubles using an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually produced
 * tol  - largest |exp - real| that is still accepted
 *
 * Returns 1 when the values are equal or differ by at most tol, 0 otherwise.
 * A NaN operand (or NaN tolerance) is reported as a mismatch.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* Identical values (including equal infinities) always match. */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  /* Phrased as "not within tolerance" so that a NaN difference fails:
     every ordered comparison involving NaN evaluates to false. */
  if (!(absdiff <= tol)) {
    return 0;
  }
  return 1;
}
/*
 * Plain C reference for the "largest value" reduction (signed comparison,
 * no absolute value).
 *
 * n     - number of elements
 * x     - input vector
 * inc_x - distance between consecutive elements
 *
 * Returns the largest element visited, or 0.0 when n or inc_x is not
 * positive.
 */
FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  BLASLONG pos;
  BLASLONG k;
  FLOAT largest=0.0;

  if (!(n > 0 && inc_x > 0)) return(largest);

  largest=x[0];

  for (k = 1, pos = inc_x; k < n; k++, pos += inc_x)
  {
    if( x[pos] > largest )
    {
      largest = x[pos];
    }
  }
  return(largest);
}
#undef MAX_
#ifdef DOUBLE
#define MAX_ BLASFUNC(dmax)
#else
#define MAX_ BLASFUNC(smax)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() stand-in for native Windows builds.
 *
 * tv - receives the current time as seconds and microseconds relative to the
 *      Unix epoch; may be NULL, in which case nothing is stored
 * tz - ignored
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){
  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz; /* no time zone information is produced */

  if (NULL != tv)
  {
    GetSystemTimeAsFileTime(&ft);
    /* glue the two 32-bit halves into one 64-bit tick count */
    tmpres |= ft.dwHighDateTime;
    tmpres <<= 32;
    tmpres |= ft.dwLowDateTime;
    /*converting file time to unix epoch*/
    tmpres /= 10; /*convert into microseconds*/
    tmpres -= DELTA_EPOCH_IN_MICROSECS;
    tv->tv_sec = (long)(tmpres / 1000000UL);
    tv->tv_usec = (long)(tmpres % 1000000UL);
  }
  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Obtain a buffer from a private System V shared-memory segment requested
 * with SHM_HUGETLB.  The byte count handed to shmget() is
 * (size + HUGE_PAGESIZE) masked with ~(HUGE_PAGESIZE - 1).
 *
 * size - number of bytes wanted
 *
 * Returns the address delivered by shmat().  Prints a message and exits with
 * status 1 when shmget() or shmat() fails.  The segment is marked IPC_RMID
 * directly after attaching.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  void *mapped;
  int segment;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);
  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Test driver: for every vector length from..to (step apart) fill x with
 * random values in [-0.5, 0.5], run the library MAX_ routine and the plain C
 * reference max_c, time both and print one line per length on stderr.  The
 * two results are compared with assert_dbl_near() using SINGLE_EPS.
 *
 * Command line:  [from [to [step]]]      (defaults 1, 200, 1)
 * Environment:   OPENBLAS_LOOPS - repetitions per length (default 1)
 *                OPENBLAS_INCX  - stride of x            (default 1)
 *
 * A step or loop count below 1 is raised to 1.
 * Returns 0; exits with status 1 if the work buffer cannot be allocated.
 */
int main(int argc, char *argv[]){
  FLOAT *x;
  FLOAT result, result_c;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg,timeg_c;
  int test = 1;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* A non-positive step would never let the size loop reach "to", and a
     non-positive repetition count would divide the timings by zero. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* inc_x is a blasint; cast it for %d just like m is cast below */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    timeg_c=0;
    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* library routine */
      gettimeofday( &start, (struct timezone *)0);
      result = MAX_ (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* reference implementation */
      gettimeofday( &start, (struct timezone *)0);
      result_c = max_c(m, x, inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      /* test is never reset, so one mismatch marks all later sizes too */
      test &= assert_dbl_near(result, result_c, SINGLE_EPS);
    }

    timeg /= loops;
    timeg_c /= loops;
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
  }
  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

229
ztest/min.c Normal file
View File

@ -0,0 +1,229 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Compare two doubles using an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually produced
 * tol  - largest |exp - real| that is still accepted
 *
 * Returns 1 when the values are equal or differ by at most tol, 0 otherwise.
 * A NaN operand (or NaN tolerance) is reported as a mismatch.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* Identical values (including equal infinities) always match. */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  /* Phrased as "not within tolerance" so that a NaN difference fails:
     every ordered comparison involving NaN evaluates to false. */
  if (!(absdiff <= tol)) {
    return 0;
  }
  return 1;
}
/*
 * Plain C reference for the "smallest value" reduction (signed comparison,
 * no absolute value).
 *
 * n     - number of elements
 * x     - input vector
 * inc_x - distance between consecutive elements
 *
 * Returns the smallest element visited, or 0.0 when n or inc_x is not
 * positive.
 */
FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  BLASLONG pos;
  BLASLONG k;
  FLOAT smallest=0.0;

  if (!(n > 0 && inc_x > 0)) return(smallest);

  smallest=x[0];

  for (k = 1, pos = inc_x; k < n; k++, pos += inc_x)
  {
    if( x[pos] < smallest )
    {
      smallest = x[pos];
    }
  }
  return(smallest);
}
#undef MIN_
#ifdef DOUBLE
#define MIN_ BLASFUNC(dmin)
#else
#define MIN_ BLASFUNC(smin)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() stand-in for native Windows builds.
 *
 * tv - receives the current time as seconds and microseconds relative to the
 *      Unix epoch; may be NULL, in which case nothing is stored
 * tz - ignored
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){
  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz; /* no time zone information is produced */

  if (NULL != tv)
  {
    GetSystemTimeAsFileTime(&ft);
    /* glue the two 32-bit halves into one 64-bit tick count */
    tmpres |= ft.dwHighDateTime;
    tmpres <<= 32;
    tmpres |= ft.dwLowDateTime;
    /*converting file time to unix epoch*/
    tmpres /= 10; /*convert into microseconds*/
    tmpres -= DELTA_EPOCH_IN_MICROSECS;
    tv->tv_sec = (long)(tmpres / 1000000UL);
    tv->tv_usec = (long)(tmpres % 1000000UL);
  }
  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Obtain a buffer from a private System V shared-memory segment requested
 * with SHM_HUGETLB.  The byte count handed to shmget() is
 * (size + HUGE_PAGESIZE) masked with ~(HUGE_PAGESIZE - 1).
 *
 * size - number of bytes wanted
 *
 * Returns the address delivered by shmat().  Prints a message and exits with
 * status 1 when shmget() or shmat() fails.  The segment is marked IPC_RMID
 * directly after attaching.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  void *mapped;
  int segment;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);
  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Test driver: for every vector length from..to (step apart) fill x with
 * random values in [-0.5, 0.5], run the library MIN_ routine and the plain C
 * reference min_c, time both and print one line per length on stderr.  The
 * two results are compared with assert_dbl_near() using SINGLE_EPS.
 *
 * Command line:  [from [to [step]]]      (defaults 1, 200, 1)
 * Environment:   OPENBLAS_LOOPS - repetitions per length (default 1)
 *                OPENBLAS_INCX  - stride of x            (default 1)
 *
 * A step or loop count below 1 is raised to 1.
 * Returns 0; exits with status 1 if the work buffer cannot be allocated.
 */
int main(int argc, char *argv[]){
  FLOAT *x;
  FLOAT result, result_c;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg,timeg_c;
  int test = 1;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* A non-positive step would never let the size loop reach "to", and a
     non-positive repetition count would divide the timings by zero. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* inc_x is a blasint; cast it for %d just like m is cast below */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    timeg_c=0;
    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* library routine */
      gettimeofday( &start, (struct timezone *)0);
      result = MIN_ (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* reference implementation */
      gettimeofday( &start, (struct timezone *)0);
      result_c = min_c(m, x, inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      /* test is never reset, so one mismatch marks all later sizes too */
      test &= assert_dbl_near(result, result_c, SINGLE_EPS);
    }

    timeg /= loops;
    timeg_c /= loops;
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
  }
  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

303
ztest/rot.c Normal file
View File

@ -0,0 +1,303 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Compare two doubles using an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually produced
 * tol  - largest |exp - real| that is still accepted
 *
 * Returns 1 when the values are equal or differ by at most tol, 0 otherwise.
 * A NaN operand (or NaN tolerance) is reported as a mismatch.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;
  double absdiff;

  /* Identical values (including equal infinities) always match. */
  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  absdiff = (diff < 0) ? -diff : diff;

  /* Phrased as "not within tolerance" so that a NaN difference fails:
     every ordered comparison involving NaN evaluates to false. */
  if (!(absdiff <= tol)) {
    return 0;
  }
  return 1;
}
#ifdef COMPLEX
/*
 * Plain C reference for the rotation of two complex vectors by real c and s.
 * For every element pair, real and imaginary parts are transformed
 * separately:
 *     x' = c*x + s*y
 *     y' = c*y - s*x
 *
 * n     - number of complex elements
 * x, y  - interleaved real/imaginary data, updated in place
 * inc_x - distance between consecutive x elements, in complex elements
 * inc_y - distance between consecutive y elements, in complex elements
 * c, s  - rotation coefficients
 *
 * Always returns 0; nothing is touched when n is not positive.
 */
int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
  BLASLONG k;
  BLASLONG px;
  BLASLONG py;
  BLASLONG stride_x;
  BLASLONG stride_y;
  FLOAT rotated[2];

  if ( n <= 0 ) return(0);

  /* two FLOATs per complex element */
  stride_x = 2 * inc_x ;
  stride_y = 2 * inc_y ;

  for (k = 0, px = 0, py = 0; k < n; k++, px += stride_x, py += stride_y)
  {
    /* new x is parked until y has been computed from the old x */
    rotated[0] = c*x[px] + s*y[py] ;
    rotated[1] = c*x[px+1] + s*y[py+1] ;
    y[py] = c*y[py] - s*x[px] ;
    y[py+1] = c*y[py+1] - s*x[px+1] ;
    x[px] = rotated[0] ;
    x[px+1] = rotated[1] ;
  }
  return(0);
}
#else
/*
 * Plain C reference for the rotation of two real vectors:
 *     x' = c*x + s*y
 *     y' = c*y - s*x
 *
 * n     - number of elements
 * x, y  - vectors, updated in place
 * inc_x - distance between consecutive x elements
 * inc_y - distance between consecutive y elements
 * c, s  - rotation coefficients
 *
 * Always returns 0; nothing is touched when n is not positive.
 */
int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
  BLASLONG k;
  BLASLONG px;
  BLASLONG py;
  FLOAT rotated;

  if ( n <= 0 ) return(0);

  for (k = 0, px = 0, py = 0; k < n; k++, px += inc_x, py += inc_y)
  {
    /* new x is parked until y has been computed from the old x */
    rotated = c*x[px] + s*y[py] ;
    y[py] = c*y[py] - s*x[px] ;
    x[px] = rotated ;
  }
  return(0);
}
#endif
#undef ROT
#ifdef COMPLEX
#ifdef DOUBLE
#define ROT BLASFUNC(zdrot)
#else
#define ROT BLASFUNC(csrot)
#endif
#else
#ifdef DOUBLE
#define ROT BLASFUNC(drot)
#else
#define ROT BLASFUNC(srot)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() stand-in for native Windows builds.
 *
 * tv - receives the current time as seconds and microseconds relative to the
 *      Unix epoch; may be NULL, in which case nothing is stored
 * tz - ignored
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){
  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz; /* no time zone information is produced */

  if (NULL != tv)
  {
    GetSystemTimeAsFileTime(&ft);
    /* glue the two 32-bit halves into one 64-bit tick count */
    tmpres |= ft.dwHighDateTime;
    tmpres <<= 32;
    tmpres |= ft.dwLowDateTime;
    /*converting file time to unix epoch*/
    tmpres /= 10; /*convert into microseconds*/
    tmpres -= DELTA_EPOCH_IN_MICROSECS;
    tv->tv_sec = (long)(tmpres / 1000000UL);
    tv->tv_usec = (long)(tmpres % 1000000UL);
  }
  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Obtain a buffer from a private System V shared-memory segment requested
 * with SHM_HUGETLB.  The byte count handed to shmget() is
 * (size + HUGE_PAGESIZE) masked with ~(HUGE_PAGESIZE - 1).
 *
 * size - number of bytes wanted
 *
 * Returns the address delivered by shmat().  Prints a message and exits with
 * status 1 when shmget() or shmat() fails.  The segment is marked IPC_RMID
 * directly after attaching.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
  void *mapped;
  int segment;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);
  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Test driver: for every vector length from..to (step apart) fill x and y
 * with random values in [-0.5, 0.5], keep copies in x_c / y_c, run the
 * library ROT routine on x / y and the plain C reference on the copies, time
 * both and print one line per length on stderr.  Afterwards the complete
 * initialised part of each buffer is compared element by element, so both
 * components of complex data and the gaps between strided elements are
 * verified.
 *
 * Command line:  [from [to [step]]]      (defaults 1, 200, 1)
 * Environment:   OPENBLAS_LOOPS - repetitions per length (default 1)
 *                OPENBLAS_INCX  - stride of x            (default 1)
 *                OPENBLAS_INCY  - stride of y            (default 1)
 *
 * A step or loop count below 1 is raised to 1.
 * Returns 0; exits with status 1 if a work buffer cannot be allocated.
 */
int main(int argc, char *argv[]){
  FLOAT *x, *y, *x_c, *y_c;
  FLOAT *x_ref, *y_ref;
  blasint m, i;
  blasint inc_x=1,inc_y=1;
  FLOAT c[1] = { 2.0 };
  FLOAT s[1] = { 2.0 };
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg,timeg_c;
  int test = 1;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

  /* A non-positive step would never let the size loop reach "to", and a
     non-positive repetition count would divide the timings by zero. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* the increments are blasint; cast them for %d just like m is cast below */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,(int)inc_x,(int)inc_y,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
  if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
  if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    timeg_c=0;
    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        x_c[i] = x[i];
      }
      for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
        y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        y_c[i] = y[i];
      }

      /* library routine */
      gettimeofday( &start, (struct timezone *)0);
      ROT (&m, x, &inc_x, y, &inc_y, c, s);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* The reference loops start at offset 0 and advance by the raw
         increment.  With a negative increment they therefore need to be
         handed the far end of the vector (the BLAS convention for negative
         strides); otherwise they would index in front of the buffer. */
      x_ref = x_c;
      y_ref = y_c;
      if (m > 0 && inc_x < 0) x_ref -= (m - 1) * inc_x * COMPSIZE;
      if (m > 0 && inc_y < 0) y_ref -= (m - 1) * inc_y * COMPSIZE;

      /* reference implementation */
      gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
      zrot_c(m, x_ref, inc_x, y_ref, inc_y, *c, *s);
#else
      rot_c(m, x_ref, inc_x, y_ref, inc_y, *c, *s);
#endif
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      /* Compare every initialised FLOAT of both buffers.  Stepping by the
         increment alone would skip the imaginary parts of strided complex
         data and would run off the front for negative increments.
         test is never reset, so one mismatch marks all later sizes too. */
      for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
      {
        test &= assert_dbl_near(x[i], x_c[i], SINGLE_EPS);
      }
      for (i = 0; i < m * COMPSIZE * abs(inc_y); i++)
      {
        test &= assert_dbl_near(y[i], y_c[i], SINGLE_EPS);
      }
    }

    timeg /= loops;
    timeg_c /= loops;
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
  }
  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

308
ztest/scal.c Normal file
View File

@ -0,0 +1,308 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
/* Absolute tolerances intended for assert_dbl_near().
 * main() in this file passes SINGLE_EPS for every comparison;
 * DOUBLE_EPS is defined but not referenced anywhere in this file. */
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Absolute-difference comparison of two doubles.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest absolute difference still counted as a match
 *
 * Returns 1 when |exp - real| does not exceed tol, 0 otherwise.
 */
int assert_dbl_near(double exp, double real, double tol) {
	double gap = exp - real;

	/* Negate by hand so no math library (fabs) has to be linked. */
	if (gap < 0) {
		gap *= -1;
	}
	return (gap > tol) ? 0 : 1;
}
#ifdef COMPLEX
/*
 * Plain C reference for complex scaling: x := (da_r + i*da_i) * x.
 *
 * n            - number of complex elements to scale
 * da_r, da_i   - real and imaginary part of the scalar
 * x            - interleaved (re, im) data, modified in place
 * inc_x        - stride between consecutive complex elements
 * dummy0, dummy1, y, inc_y, dummy, dummy2 - not used by this routine
 *
 * Nothing is done when n <= 0 or inc_x <= 0. Zero components of the scalar
 * are special-cased: the corresponding products are skipped, and a scalar
 * of exactly 0 + 0i stores plain zeros. Always returns 0.
 */
int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG pos = 0;
	BLASLONG stride;
	FLOAT re;

	if (n <= 0 || inc_x <= 0) {
		return(0);
	}

	/* Each complex element occupies two FLOAT slots. */
	stride = 2 * inc_x;

	for (k = 0; k < n; k++) {
		if (da_r == 0.0 && da_i == 0.0) {
			re = 0.0;
			x[pos+1] = 0.0 ;
		} else if (da_r == 0.0) {
			/* Purely imaginary scalar. */
			re = - da_i * x[pos+1] ;
			x[pos+1] = da_i * x[pos] ;
		} else if (da_i == 0.0) {
			/* Purely real scalar. */
			re = da_r * x[pos] ;
			x[pos+1] = da_r * x[pos+1];
		} else {
			re = da_r * x[pos] - da_i * x[pos+1] ;
			x[pos+1] = da_r * x[pos+1] + da_i * x[pos] ;
		}
		/* The real part is written last because the imaginary part reads it. */
		x[pos] = re;
		pos += stride;
	}
	return(0);
}
#else
/*
 * Plain C reference for real scaling: x := da * x.
 *
 * n      - number of elements to scale
 * da     - scalar factor
 * x      - data, modified in place
 * inc_x  - stride between consecutive elements
 * dummy0, dummy1, y, inc_y, dummy, dummy2 - not used by this routine
 *
 * Nothing is done when n <= 0 or inc_x <= 0. When da compares equal to zero
 * the elements are set to 0.0 directly instead of being multiplied.
 * Always returns 0.
 */
int scal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG pos = 0;
	BLASLONG k;

	if (n <= 0 || inc_x <= 0) {
		return(0);
	}

	for (k = 0; k < n; k++) {
		if ( da == 0.0 ) {
			x[pos] = 0.0;
		} else {
			x[pos] = da * x[pos] ;
		}
		pos += inc_x ;
	}
	return 0;
}
#endif
/* Pick the BLAS scal entry point that main() calls as SCAL, based on the
 * COMPLEX and DOUBLE build macros:
 *   COMPLEX + DOUBLE -> zscal,  COMPLEX only -> cscal,
 *   DOUBLE only      -> dscal,  neither      -> sscal. */
#undef SCAL
#ifdef COMPLEX
#ifdef DOUBLE
#define SCAL BLASFUNC(zscal)
#else
#define SCAL BLASFUNC(cscal)
#endif
#else
#ifdef DOUBLE
#define SCAL BLASFUNC(dscal)
#else
#define SCAL BLASFUNC(sscal)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
/* Optional huge-page allocator. The trailing "&& 0" makes this condition
 * always false, so the whole section (including the malloc override at the
 * end) is currently compiled out. */
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Allocate `size` bytes from a SysV shared-memory segment created with
 * SHM_HUGETLB. Prints a message and exits with status 1 if shmget() or
 * shmat() fails; otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
/* Fallback value for headers that do not define SHM_HUGETLB. */
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
/* The requested size is rounded up to a multiple of HUGE_PAGESIZE
 * (a size that is already a multiple gains one extra page). */
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
/* IPC_RMID is issued right after attaching; presumably so the segment is
 * reclaimed once it is detached -- confirm against the shmctl docs. */
shmctl(shmid, IPC_RMID, 0);
return address;
}
/* When enabled, every later malloc() call in this file uses huge_malloc(). */
#define malloc huge_malloc
#endif
/*
 * Benchmark and self-check driver for the SCAL routine.
 *
 * Usage: prog [from [to [step]]]
 *   from, to, step - vector sizes to test (defaults 1, 200, 1)
 * Environment:
 *   OPENBLAS_LOOPS - repetitions per size (default 1)
 *   OPENBLAS_INCX  - stride passed to SCAL (default 1)
 *
 * For each size the vector is filled with values in [-0.5, 0.5], scaled once
 * by SCAL and once by the C reference (zscal_c / scal_c), both calls are
 * timed, and the two results are compared with assert_dbl_near().
 * One line per size is written to stderr. The PASS/FAILED flag is sticky:
 * once any comparison fails, every later line reports FAILED.
 * Always returns 0.
 */
int main(int argc, char *argv[]){
	FLOAT *x, *x_c;
	FLOAT alpha[2] = { 2.0, 2.0 };
	blasint m, i;
	blasint inc_x=1;
	int loops = 1;
	int l;
	char *p;
	int from = 1;
	int to = 200;
	int step = 1;
	struct timeval start, stop;
	double time1,timeg,timeg_c;
	int test = 1;

	argc--;argv++;
	if (argc > 0) { from = atol(*argv); argc--; argv++;}
	if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
	if (argc > 0) { step = atol(*argv); argc--; argv++;}
	if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
	if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

	/* A step below 1 would never advance m past `to`; a loop count below 1
	 * would make the averages below divide by zero (or a negative number). */
	if (step < 1) step = 1;
	if (loops < 1) loops = 1;

	/* blasint may be wider than int, so cast it for %d. */
	fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

	if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}
	if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}

#ifdef linux
	/* Seed the generator that is actually used below (rand), not random(). */
	srand((unsigned int)getpid());
#endif

	fprintf(stderr, " SIZE Flops Time CTime Test\n");

	for(m = from; m <= to; m += step)
	{
		timeg=0;
		timeg_c=0;
		fprintf(stderr, " %6d :", (int)m);

		for (l=0; l<loops; l++)
		{
			/* Identical input for the routine under test and the reference. */
			for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
				x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
				x_c[i] = x[i];
			}

			gettimeofday( &start, (struct timezone *)0);
			SCAL (&m, alpha, x, &inc_x);
			gettimeofday( &stop, (struct timezone *)0);
			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
			timeg += time1;

			gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
			zscal_c(m, 0, 0, alpha[0],alpha[1], x_c, inc_x, NULL, 0, NULL, 0);
#else
			scal_c(m, 0, 0, *alpha, x_c, inc_x, NULL, 0, NULL, 0);
#endif
			gettimeofday( &stop, (struct timezone *)0);
			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
			timeg_c += time1;

			/* Compare every slot that was initialised above. Walking the
			 * buffers with "ix += inc_x" would index before the buffer for a
			 * negative stride and would skip imaginary parts for strided
			 * complex data; a full sweep also catches stray writes to slots
			 * that should stay untouched. */
			for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
			{
				test &= assert_dbl_near(x[i], x_c[i], SINGLE_EPS);
			}
		}

		timeg /= loops;
		timeg_c /= loops;
#ifdef COMPLEX
		fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILED");
#else
		fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILED");
#endif
	}
	return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

306
ztest/swap.c Normal file
View File

@ -0,0 +1,306 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
/* Absolute tolerances intended for assert_dbl_near().
 * main() in this file passes SINGLE_EPS for every comparison;
 * DOUBLE_EPS is defined but not referenced anywhere in this file. */
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
/*
 * Report whether two doubles agree to within an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest absolute difference still accepted
 *
 * Returns 0 if |exp - real| is greater than tol, 1 otherwise.
 */
int assert_dbl_near(double exp, double real, double tol) {
	double delta = exp - real;

	/* Sign flip done manually: avoids fabs and linking a math library. */
	if (delta < 0) {
		delta *= -1;
	}
	if (delta > tol) {
		return 0;
	}
	return 1;
}
#ifdef COMPLEX
/*
 * Plain C reference for swapping two complex vectors.
 *
 * n             - number of complex elements to exchange
 * x, inc_x      - first vector (interleaved re, im) and its element stride
 * y, inc_y      - second vector (interleaved re, im) and its element stride
 * dummy0, dummy1, dummy3, dummy4, dummy, dummy2 - not used by this routine
 *
 * Indexing starts at x[0] / y[0] and advances by twice the given stride per
 * element. Returns 0; nothing is exchanged when n is not positive.
 */
int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG px = 0;
	BLASLONG py = 0;
	BLASLONG step_x;
	BLASLONG step_y;

	if ( n < 0 ) {
		return(0);
	}

	/* Each complex element occupies two FLOAT slots. */
	step_x = 2 * inc_x;
	step_y = 2 * inc_y;

	for (k = 0; k < n; k++) {
		FLOAT re = x[px] ;
		FLOAT im = x[px+1] ;

		x[px] = y[py] ;
		x[px+1] = y[py+1] ;
		y[py] = re ;
		y[py+1] = im ;

		px += step_x ;
		py += step_y ;
	}
	return(0);
}
#else
/*
 * Plain C reference for swapping two real vectors.
 *
 * n         - number of elements to exchange
 * x, inc_x  - first vector and its stride
 * y, inc_y  - second vector and its stride
 * dummy0, dummy1, dummy3, dummy, dummy2 - not used by this routine
 *
 * Indexing starts at x[0] / y[0]. Returns 0; nothing is exchanged when n is
 * not positive.
 */
int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG px = 0;
	BLASLONG py = 0;

	if ( n < 0 ) {
		return(0);
	}

	for (k = 0; k < n; k++) {
		FLOAT held = x[px] ;

		x[px] = y[py] ;
		y[py] = held ;

		px += inc_x ;
		py += inc_y ;
	}
	return(0);
}
#endif
/* Pick the BLAS swap entry point that main() calls as SWAP, based on the
 * COMPLEX and DOUBLE build macros:
 *   COMPLEX + DOUBLE -> zswap,  COMPLEX only -> cswap,
 *   DOUBLE only      -> dswap,  neither      -> sswap. */
#undef SWAP
#ifdef COMPLEX
#ifdef DOUBLE
#define SWAP BLASFUNC(zswap)
#else
#define SWAP BLASFUNC(cswap)
#endif
#else
#ifdef DOUBLE
#define SWAP BLASFUNC(dswap)
#else
#define SWAP BLASFUNC(sswap)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
/* Optional huge-page allocator. The trailing "&& 0" makes this condition
 * always false, so the whole section (including the malloc override at the
 * end) is currently compiled out. */
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Allocate `size` bytes from a SysV shared-memory segment created with
 * SHM_HUGETLB. Prints a message and exits with status 1 if shmget() or
 * shmat() fails; otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
/* Fallback value for headers that do not define SHM_HUGETLB. */
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
/* The requested size is rounded up to a multiple of HUGE_PAGESIZE
 * (a size that is already a multiple gains one extra page). */
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
/* IPC_RMID is issued right after attaching; presumably so the segment is
 * reclaimed once it is detached -- confirm against the shmctl docs. */
shmctl(shmid, IPC_RMID, 0);
return address;
}
/* When enabled, every later malloc() call in this file uses huge_malloc(). */
#define malloc huge_malloc
#endif
/*
 * Benchmark and self-check driver for the SWAP routine.
 *
 * Usage: prog [from [to [step]]]
 *   from, to, step - vector sizes to test (defaults 1, 200, 1)
 * Environment:
 *   OPENBLAS_LOOPS - repetitions per size (default 1)
 *   OPENBLAS_INCX  - stride of x passed to SWAP (default 1)
 *   OPENBLAS_INCY  - stride of y passed to SWAP (default 1)
 *
 * For each size both vectors are filled with values in [-0.5, 0.5], swapped
 * once by SWAP and once by the C reference (zswap_c / swap_c), both calls
 * are timed, and the results are compared with assert_dbl_near().
 * One line per size is written to stderr. The PASS/FAILED flag is sticky:
 * once any comparison fails, every later line reports FAILED.
 * Always returns 0.
 */
int main(int argc, char *argv[]){
	FLOAT *x, *y, *x_c, *y_c;
	FLOAT *x_ref, *y_ref;
	blasint m, i;
	blasint inc_x=1,inc_y=1;
	int loops = 1;
	int l;
	char *p;
	int from = 1;
	int to = 200;
	int step = 1;
	struct timeval start, stop;
	double time1,timeg,timeg_c;
	int test = 1;

	argc--;argv++;
	if (argc > 0) { from = atol(*argv); argc--; argv++;}
	if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
	if (argc > 0) { step = atol(*argv); argc--; argv++;}
	if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
	if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
	if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

	/* A step below 1 would never advance m past `to`; a loop count below 1
	 * would make the averages below divide by zero (or a negative number). */
	if (step < 1) step = 1;
	if (loops < 1) loops = 1;

	/* blasint may be wider than int, so cast it for %d. */
	fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,(int)inc_x,(int)inc_y,loops);

	if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}
	if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}
	if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}
	if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
		fprintf(stderr,"Out of Memory!!\n");exit(1);
	}

#ifdef linux
	/* Seed the generator that is actually used below (rand), not random(). */
	srand((unsigned int)getpid());
#endif

	fprintf(stderr, " SIZE Flops Time CTime Test\n");

	for(m = from; m <= to; m += step)
	{
		timeg=0;
		timeg_c=0;
		fprintf(stderr, " %6d :", (int)m);

		/* The reference routines index from element 0 of the pointer they
		 * are given. For a negative increment BLAS walks the vector
		 * backwards from its last element, so hand the reference a pointer
		 * to that element; otherwise it would index before the buffer. */
		x_ref = x_c;
		y_ref = y_c;
		if (inc_x < 0 && m > 0) x_ref -= (m - 1) * inc_x * COMPSIZE;
		if (inc_y < 0 && m > 0) y_ref -= (m - 1) * inc_y * COMPSIZE;

		for (l=0; l<loops; l++)
		{
			/* Identical input for the routine under test and the reference. */
			for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
				x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
				x_c[i] = x[i];
			}
			for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
				y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
				y_c[i] = y[i];
			}

			gettimeofday( &start, (struct timezone *)0);
			SWAP (&m, x, &inc_x, y, &inc_y );
			gettimeofday( &stop, (struct timezone *)0);
			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
			timeg += time1;

			gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
			zswap_c(m, 0, 0, 0, 0, x_ref, inc_x, y_ref, inc_y, NULL, 0);
#else
			swap_c(m, 0, 0, 0, x_ref, inc_x, y_ref, inc_y, NULL, 0);
#endif
			gettimeofday( &stop, (struct timezone *)0);
			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
			timeg_c += time1;

			/* Compare every slot that was initialised above, each vector
			 * over its own range. The earlier strided check indexed y with
			 * the x index, went out of bounds for negative increments and
			 * skipped imaginary parts of strided complex data. */
			for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
			{
				test &= assert_dbl_near(x[i], x_c[i], SINGLE_EPS);
			}
			for (i = 0; i < m * COMPSIZE * abs(inc_y); i++)
			{
				test &= assert_dbl_near(y[i], y_c[i], SINGLE_EPS);
			}
		}

		timeg /= loops;
		timeg_c /= loops;
#ifdef COMPLEX
		fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILED");
#else
		fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILED");
#endif
	}
	return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));