diff --git a/Makefile.zarch b/Makefile.zarch index 9ec9dc79f..47ea1eb71 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector FCOMMON_OPT += -march=z13 -mzvector endif +ifeq ($(CORE), Z14) +CCOMMON_OPT += -march=z14 -mzvector +FCOMMON_OPT += -march=z14 -mzvector +endif diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 4e1935429..0ae32f27d 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -29,40 +29,25 @@ #define CPU_GENERIC 0 #define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", - "Z13" + "Z13", + "Z14" }; static char *cpuname_lower[] = { "zarch_generic", - "z13" + "z13", + "z14" }; int detect(void) { - FILE *infile; - char buffer[512], *p; - - p = (char *)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Type", buffer, 4)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if (strstr(p, "2964")) return CPU_Z13; - if (strstr(p, "2965")) return CPU_Z13; - - return CPU_GENERIC; + // return CPU_GENERIC; + return CPU_Z14; + } void get_libname(void) @@ -107,5 +92,9 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; + case CPU_Z14: + printf("#define Z14\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + break; } } diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index add628bfe..d39b9d904 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -1,18 +1,18 @@ SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = damax.c CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = zamax.c SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c +DAMINKERNEL = damin.c CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = zamin.c SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c +DMAXKERNEL = dmax.c SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c +DMINKERNEL = dmin.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = idamax.c @@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = izamin.c ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = idmax.c ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c +IDMINKERNEL = idmin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = dasum.c @@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n_4.c CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = zgemv_t_4.c +ZGEMVTKERNEL = ../arm/zgemv_t.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 new file mode 100644 index 000000000..fa88b6881 --- /dev/null +++ b/kernel/zarch/KERNEL.Z14 @@ -0,0 +1,146 @@ +SAMAXKERNEL = samax.c +DAMAXKERNEL = damax.c +CAMAXKERNEL = camax.c +ZAMAXKERNEL = zamax.c + +SAMINKERNEL = samin.c +DAMINKERNEL = damin.c +CAMINKERNEL = camin.c +ZAMINKERNEL = zamin.c + +SMAXKERNEL = smax.c +DMAXKERNEL = dmax.c + +SMINKERNEL = smin.c +DMINKERNEL = dmin.c + +ISAMAXKERNEL = isamax.c +IDAMAXKERNEL = idamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c + +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c +IZAMINKERNEL = izamin.c + +ISMAXKERNEL = ismax.c +IDMAXKERNEL = idmax.c + +ISMINKERNEL = ismin.c +IDMINKERNEL = idmin.c + +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c + +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c + +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c +DSDOTKERNEL = dsdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c + +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c + +SGEMVNKERNEL = sgemv_n_4.c +DGEMVNKERNEL = dgemv_n_4.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = sgemv_t_4.c +DGEMVTKERNEL = dgemv_t_4.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = strmm8x4V.S +DTRMMKERNEL = trmm8x4V.S +CTRMMKERNEL = ctrmm4x4V.S +ZTRMMKERNEL = ztrmm4x4V.S + +SGEMMKERNEL = strmm8x4V.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + + + +DGEMMKERNEL = gemm8x4V.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ctrmm4x4V.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ztrmm4x4V.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + + diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c new file mode 100644 index 000000000..6394be769 --- /dev/null +++ b/kernel/zarch/camax.c @@ -0,0 +1,269 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vlef %%v0,0(%2),0 \n\t" + "vlef %%v16,4(%2),0 \n\t" + "vlef %%v0,8(%2),1 \n\t" + "vlef %%v16,12(%2),1 \n\t" + "vlef %%v0,16(%2),2 \n\t" + "vlef %%v16,20(%2),2 \n\t" + "vlef %%v0,24(%2),3 \n\t" + "vlef %%v16,28(%2),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v16,%%v16 \n\t" + "vfasb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%2) \n\t" + + "vlef %%v16,0(%%r1,%2),0 \n\t" + "vlef %%v17,4(%%r1,%2),0 \n\t" + "vlef %%v16,8(%%r1,%2),1 \n\t" + "vlef %%v17,12(%%r1,%2),1 \n\t" + "vlef %%v16,16(%%r1,%2),2 \n\t" + "vlef %%v17,20(%%r1,%2),2 \n\t" + "vlef %%v16,24(%%r1,%2),3 \n\t" + "vlef %%v17,28(%%r1,%2),3 \n\t" + + "vlef %%v18,32(%%r1,%2),0 \n\t" + "vlef %%v19,36(%%r1,%2),0 \n\t" + "vlef %%v18,40(%%r1,%2),1 \n\t" + "vlef %%v19,44(%%r1,%2),1 \n\t" + "vlef %%v18,48(%%r1,%2),2 \n\t" + "vlef %%v19,52(%%r1,%2),2 \n\t" + "vlef %%v18,56(%%r1,%2),3 \n\t" + "vlef %%v19,30(%%r1,%2),3 \n\t" + + "vlef %%v20,64(%%r1,%2),0 \n\t" + "vlef %%v21,68(%%r1,%2),0 \n\t" + "vlef %%v20,72(%%r1,%2),1 \n\t" + "vlef %%v21,76(%%r1,%2),1 \n\t" + "vlef %%v20,80(%%r1,%2),2 \n\t" + "vlef %%v21,84(%%r1,%2),2 \n\t" + "vlef %%v20,88(%%r1,%2),3 \n\t" + "vlef %%v21,92(%%r1,%2),3 \n\t" + + "vlef %%v22,96(%%r1,%2),0 \n\t" + "vlef %%v23,100(%%r1,%2),0 \n\t" + "vlef %%v22,104(%%r1,%2),1 \n\t" + "vlef %%v23,108(%%r1,%2),1 \n\t" + "vlef %%v22,112(%%r1,%2),2 \n\t" + "vlef %%v23,116(%%r1,%2),2 \n\t" + "vlef %%v22,120(%%r1,%2),3 \n\t" + "vlef %%v23,124(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vlef %%v16,128(%%r1,%2),0 \n\t" + "vlef %%v17,132(%%r1,%2),0 \n\t" + "vlef %%v16,136(%%r1,%2),1 \n\t" + "vlef %%v17,140(%%r1,%2),1 \n\t" + "vlef %%v16,144(%%r1,%2),2 \n\t" + "vlef %%v17,148(%%r1,%2),2 \n\t" + "vlef %%v16,152(%%r1,%2),3 \n\t" + "vlef %%v17,156(%%r1,%2),3 \n\t" + + "vlef %%v18,160(%%r1,%2),0 \n\t" + "vlef %%v19,164(%%r1,%2),0 \n\t" + "vlef %%v18,168(%%r1,%2),1 \n\t" + "vlef %%v19,172(%%r1,%2),1 \n\t" + "vlef %%v18,176(%%r1,%2),2 \n\t" + "vlef %%v19,180(%%r1,%2),2 \n\t" + "vlef %%v18,184(%%r1,%2),3 \n\t" + "vlef %%v19,188(%%r1,%2),3 \n\t" + + "vlef %%v20,192(%%r1,%2),0 \n\t" + "vlef %%v21,196(%%r1,%2),0 \n\t" + "vlef %%v20,200(%%r1,%2),1 \n\t" + "vlef %%v21,204(%%r1,%2),1 \n\t" + "vlef %%v20,208(%%r1,%2),2 \n\t" + "vlef %%v21,212(%%r1,%2),2 \n\t" + "vlef %%v20,216(%%r1,%2),3 \n\t" + "vlef %%v21,220(%%r1,%2),3 \n\t" + + "vlef %%v22,224(%%r1,%2),0 \n\t" + "vlef %%v23,228(%%r1,%2),0 \n\t" + "vlef %%v22,232(%%r1,%2),1 \n\t" + "vlef %%v23,236(%%r1,%2),1 \n\t" + "vlef %%v22,240(%%r1,%2),2 \n\t" + "vlef %%v23,244(%%r1,%2),2 \n\t" + "vlef %%v22,248(%%r1,%2),3 \n\t" + "vlef %%v23,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = camax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=CABS1(x,0); + i++; + } + + while (i < n) { + if (ABS(x[i*2]) > maxf) { + maxf = ABS(x[i*2]); + } + i++; + } + return (maxf); + + } else { + + inc_x2 = 2 * inc_x; + maxf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) > maxf) { + maxf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) > maxf) { + maxf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) > maxf) { + maxf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c new file mode 100644 index 000000000..936c300c8 --- /dev/null +++ b/kernel/zarch/camin.c @@ -0,0 +1,269 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vlef %%v0,0(%2),0 \n\t" + "vlef %%v16,4(%2),0 \n\t" + "vlef %%v0,8(%2),0 \n\t" + "vlef %%v16,12(%2),0 \n\t" + "vlef %%v0,16(%2),2 \n\t" + "vlef %%v16,20(%2),2 \n\t" + "vlef %%v0,24(%2),3 \n\t" + "vlef %%v16,28(%2),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v16,%%v16 \n\t" + "vfasb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vlef %%v16,0(%%r1,%2),0 \n\t" + "vlef %%v17,4(%%r1,%2),0 \n\t" + "vlef %%v16,8(%%r1,%2),0 \n\t" + "vlef %%v17,12(%%r1,%2),0 \n\t" + "vlef %%v16,16(%%r1,%2),2 \n\t" + "vlef %%v17,20(%%r1,%2),2 \n\t" + "vlef %%v16,24(%%r1,%2),3 \n\t" + "vlef %%v17,28(%%r1,%2),3 \n\t" + + "vlef %%v18,32(%%r1,%2),0 \n\t" + "vlef %%v19,36(%%r1,%2),0 \n\t" + "vlef %%v18,40(%%r1,%2),0 \n\t" + "vlef %%v19,44(%%r1,%2),0 \n\t" + "vlef %%v18,48(%%r1,%2),2 \n\t" + "vlef %%v19,52(%%r1,%2),2 \n\t" + "vlef %%v18,56(%%r1,%2),3 \n\t" + "vlef %%v19,30(%%r1,%2),3 \n\t" + + "vlef %%v20,64(%%r1,%2),0 \n\t" + "vlef %%v21,68(%%r1,%2),0 \n\t" + "vlef %%v20,72(%%r1,%2),0 \n\t" + "vlef %%v21,76(%%r1,%2),0 \n\t" + "vlef %%v20,80(%%r1,%2),2 \n\t" + "vlef %%v21,84(%%r1,%2),2 \n\t" + "vlef %%v20,88(%%r1,%2),3 \n\t" + "vlef %%v21,92(%%r1,%2),3 \n\t" + + "vlef %%v22,96(%%r1,%2),0 \n\t" + "vlef %%v23,100(%%r1,%2),0 \n\t" + "vlef %%v22,104(%%r1,%2),0 \n\t" + "vlef %%v23,108(%%r1,%2),0 \n\t" + "vlef %%v22,112(%%r1,%2),2 \n\t" + "vlef %%v23,116(%%r1,%2),2 \n\t" + "vlef %%v22,120(%%r1,%2),3 \n\t" + "vlef %%v23,124(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vlef %%v16,128(%%r1,%2),0 \n\t" + "vlef %%v17,132(%%r1,%2),0 \n\t" + "vlef %%v16,136(%%r1,%2),0 \n\t" + "vlef %%v17,140(%%r1,%2),0 \n\t" + "vlef %%v16,144(%%r1,%2),2 \n\t" + "vlef %%v17,148(%%r1,%2),2 \n\t" + "vlef %%v16,152(%%r1,%2),3 \n\t" + "vlef %%v17,156(%%r1,%2),3 \n\t" + + "vlef %%v18,160(%%r1,%2),0 \n\t" + "vlef %%v19,164(%%r1,%2),0 \n\t" + "vlef %%v18,168(%%r1,%2),0 \n\t" + "vlef %%v19,172(%%r1,%2),0 \n\t" + "vlef %%v18,176(%%r1,%2),2 \n\t" + "vlef %%v19,180(%%r1,%2),2 \n\t" + "vlef %%v18,184(%%r1,%2),3 \n\t" + "vlef %%v19,188(%%r1,%2),3 \n\t" + + "vlef %%v20,192(%%r1,%2),0 \n\t" + "vlef %%v21,196(%%r1,%2),0 \n\t" + "vlef %%v20,200(%%r1,%2),0 \n\t" + "vlef %%v21,204(%%r1,%2),0 \n\t" + "vlef %%v20,208(%%r1,%2),2 \n\t" + "vlef %%v21,212(%%r1,%2),2 \n\t" + "vlef %%v20,216(%%r1,%2),3 \n\t" + "vlef %%v21,220(%%r1,%2),3 \n\t" + + "vlef %%v22,224(%%r1,%2),0 \n\t" + "vlef %%v23,228(%%r1,%2),0 \n\t" + "vlef %%v22,232(%%r1,%2),0 \n\t" + "vlef %%v23,236(%%r1,%2),0 \n\t" + "vlef %%v22,240(%%r1,%2),2 \n\t" + "vlef %%v23,244(%%r1,%2),2 \n\t" + "vlef %%v22,248(%%r1,%2),3 \n\t" + "vlef %%v23,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = camin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=CABS1(x,0); + i++; + } + + while (i < n) { + if (ABS(x[i*2]) < minf) { + minf = ABS(x[i*2]); + } + i++; + } + return (minf); + + } else { + + inc_x2 = 2 * inc_x; + minf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) < minf) { + minf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) < minf) { + minf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) < minf) { + minf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c new file mode 100644 index 000000000..f4ebc21bd --- /dev/null +++ b/kernel/zarch/casum.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT asum; + + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vfasb %%v0,%%v0,%%v3 \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "ler %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ip=0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -32; + if ( n1 > 0 ) + { + + sumf = casum_kernel_32(n1, x); + i=n1; + ip=2*n1; + } + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + i++; + ip+=2; + } + + } + else + { + inc_x2 = 2* inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip+=inc_x2; + i++; + } + + } + return(sumf); +} + + diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c new file mode 100644 index 000000000..2176f3dcd --- /dev/null +++ b/kernel/zarch/caxpy.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( +#if !defined(CONJ) + "vlrepf %%v0,0(%3) \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v1,4(%3),2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,4(%3),1 \n\t" + "vlef %%v1,4(%3),3 \n\t" +#else + "vlef %%v0,0(%3),1 \n\t" + "vlef %%v0,0(%3),3 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v0,0(%3),2 \n\t" + "vlrepf %%v1,4(%3) \n\t" +#endif + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + + "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,80(%%r1,%1) \n\t" + "vl %%v18,96(%%r1,%1) \n\t" + "vl %%v19,112(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + + "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,64(%%r1,%2) \n\t" + "vst %%v29,80(%%r1,%2) \n\t" + "vst %%v30,96(%%r1,%2) \n\t" + "vst %%v31,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2]; + + if (n <= 0) return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) { + da[0] = da_r; + da[1] = da_i; + caxpy_kernel_16(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { +#if !defined(CONJ) + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + i++; + ix += 2; + + } + return (0); + + + } + + inc_x *= 2; + inc_y *= 2; + + while (i < n) { + +#if !defined(CONJ) + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + ix += inc_x; + iy += inc_y; + i++; + + } + return (0); + +} + + diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c new file mode 100644 index 000000000..fc0b8d648 --- /dev/null +++ b/kernel/zarch/ccopy.c @@ -0,0 +1,99 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,5 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","r2" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + ccopy_kernel_32(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + + return(0); +} diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c new file mode 100644 index 000000000..3eda2979b --- /dev/null +++ b/kernel/zarch/cdot.c @@ -0,0 +1,182 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "vzero %%v28 \n\t" + "vzero %%v29 \n\t" + "vzero %%v30 \n\t" + "vzero %%v31 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + "verllg %%v22,%%v18,32 \n\t" + "verllg %%v23,%%v19,32 \n\t" + + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" + + "vl %%v16, 64(%%r1,%1) \n\t" + "vl %%v17, 80(%%r1,%1) \n\t" + "vl %%v18, 96(%%r1,%1) \n\t" + "vl %%v19, 112(%%r1,%1) \n\t" + "vl %%v0, 64(%%r1,%2) \n\t" + "vl %%v1, 80(%%r1,%2) \n\t" + "vl %%v2, 96(%%r1,%2) \n\t" + "vl %%v3, 112(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + "verllg %%v22,%%v18,32 \n\t" + "verllg %%v23,%%v19,32 \n\t" + + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v24,%%v24,%%v26 \n\t" + "vfasb %%v24,%%v24,%%v28 \n\t" + "vfasb %%v24,%%v24,%%v30 \n\t" + "vrepg %%v26,%%v24,1 \n\t" + "vfasb %%v24,%%v24,%%v26 \n\t" + "vfasb %%v25,%%v25,%%v27 \n\t" + "vfasb %%v25,%%v25,%%v29 \n\t" + "vfasb %%v25,%%v25,%%v31 \n\t" + "vrepg %%v27,%%v25,1 \n\t" + "vfasb %%v25,%%v25,%%v27 \n\t" + "vstef %%v24,0(%3),0 \n\t" + "vstef %%v24,4(%3),1 \n\t" + "vstef %%v25,8(%3),1 \n\t" + "vstef %%v25,12(%3),0 " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); + + } + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) + cdot_kernel_16(n1, x, y, dot); + + i = n1; + BLASLONG j = i * 2; + + while (i < n) { + + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; + + j += 2; + i++; + + } + + + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { + + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; + + ix += inc_x; + iy += inc_y; + i++; + + } + } + +#if !defined(CONJ) + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; +#else + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; + +#endif + + return (result); + +} + + diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c new file mode 100644 index 000000000..f04a624ac --- /dev/null +++ b/kernel/zarch/crot.c @@ -0,0 +1,256 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + __asm__ ( + "vlrepf %%v0,%3 \n\t" + "vlrepf %%v1,%4 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + crot_kernel_32(n1, x, y, &cosa, &sina); + i=n1; + ix=2*n1; + } + + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c new file mode 100644 index 000000000..0c15c5add --- /dev/null +++ b/kernel/zarch/cscal.c @@ -0,0 +1,456 @@ +/*************************************************************************** +Copyright (c) 2013 - 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepf %%v0,0(%1) \n\t" + "vlef %%v1,4(%1),0 \n\t" + "vlef %%v1,4(%1),2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,4(%1),1 \n\t" + "vlef %%v1,4(%1),3 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + "verllg %%v28,%%v20,32 \n\t" + "verllg %%v29,%%v21,32 \n\t" + "verllg %%v30,%%v22,32 \n\t" + "verllg %%v31,%%v23,32 \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16 \n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17 \n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18 \n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19 \n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20 \n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21 \n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22 \n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlef %%v0,4(%1),0 \n\t" + "vlef %%v0,4(%1),2 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,4(%1),1 \n\t" + "vlef %%v0,4(%1),3 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v16,%%v16,32 \n\t" + "verllg %%v17,%%v17,32 \n\t" + "verllg %%v18,%%v18,32 \n\t" + "verllg %%v19,%%v19,32 \n\t" + "verllg %%v20,%%v20,32 \n\t" + "verllg %%v21,%%v21,32 \n\t" + "verllg %%v22,%%v22,32 \n\t" + "verllg %%v23,%%v23,32 \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepf %%v0,0(%1) \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); +} + +static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) + { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); + + if (inc_x != 1) { + inc_x <<= 1; + + if (da_r == 0.0) { + + BLASLONG n1 = n & -2; + + if (da_i == 0.0) { + + while (j < n1) { + + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; + + } + + } else { + + while (j < n1) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + + + } + + } else { + + + if (da_i == 0.0) { + BLASLONG n1 = n & -2; + + while (j < n1) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } else { + + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + cscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } + + } + + return (0); + } + + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + alpha[0] = da_r; + alpha[1] = da_i; + + if (da_r == 0.0) + if (da_i == 0) + cscal_kernel_16_zero(n1, x); + else + cscal_kernel_16_zero_r(n1, alpha, x); + else + if (da_i == 0) + cscal_kernel_16_zero_i(n1, alpha, x); + else + cscal_kernel_16(n1, alpha, x); + + i = n1 << 1; + j = n1; + } + + + if (da_r == 0.0) { + + if (da_i == 0.0) { + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } else { + + if (da_i == 0.0) { + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } + + return (0); +} diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c new file mode 100644 index 000000000..256995d50 --- /dev/null +++ b/kernel/zarch/cswap.c @@ -0,0 +1,183 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + cswap_kernel_32(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c new file mode 100644 index 000000000..b74af5d37 --- /dev/null +++ b/kernel/zarch/damax.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = damax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c new file mode 100644 index 000000000..4cf5e88b1 --- /dev/null +++ b/kernel/zarch/damin.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = damin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 7a42a0863..fea431c34 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,8 +23,7 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" #include @@ -35,80 +34,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ABS fabsf #endif +static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT asum; - + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" -static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT asum ; - __asm__ ( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_temp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "la %[ptr_temp],256(%[ptr_temp]) \n\t" - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "clgrjl %[ptr_temp],%%r0,1b \n\t" - "vfadb %%v24,%%v0,%%v1 \n\t" - "vfadb %%v25,%%v2,%%v3 \n\t" - "vfadb %%v0,%%v25,%%v24 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %[asum],%%f0 \n\t" - : [asum] "=f"(asum),[ptr_temp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x) - : "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return asum; + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v2 \n\t" + "vfadb %%v0,%%v0,%%v3 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; } - - - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 16f82a587..e8823745e 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -25,98 +25,99 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" -#define PREFETCH_INS 1 -#if defined(Z13_A) -#include - -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) -{ - BLASLONG i = 0; - __vector double v_a = {alpha,alpha}; - __vector double * v_y=(__vector double *)y; - __vector double * v_x=(__vector double *)x; - - for(; i -#endif - -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double v_x2 = {x2,x2}; - __vector double v_x3 = {x3,x3}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; + __asm__ volatile ( + "vlrepg %%v0,0(%5) \n\t" + "vlrepg %%v1,8(%5) \n\t" + "vlrepg %%v2,16(%5) \n\t" + "vlrepg %%v3,24(%5) \n\t" + "vlrepg %%v4,%7 \n\t" + "vfmdb %%v0,%%v0,%%v4 \n\t" + "vfmdb %%v1,%%v1,%%v4 \n\t" + "vfmdb %%v2,%%v2,%%v4 \n\t" + "vfmdb %%v3,%%v3,%%v4 \n\t" + "xgr %%r1,%%r1 \n\t" - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ; - } -} + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" -#else + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + "vl %%v24,32(%%r1,%1) \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vl %%v28,48(%%r1,%1) \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vl %%v31,48(%%r1,%4) \n\t" - for ( i=0; i<4; i++) - x[i] = xo[i] * *alpha; + "vl %%v4,0(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; - } -} + "vl %%v4,16(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + "vl %%v4,32(%%r1,%6) \n\t" + "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,32(%%r1,%6) \n\t" -#endif + "vl %%v4,48(%%r1,%6) \n\t" + "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,48(%%r1,%6) \n\t" -#ifdef HAVE_KERNEL_4x2 + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,64(%%r1,%2) \n\t" + "vl %%v18,64(%%r1,%3) \n\t" + "vl %%v19,64(%%r1,%4) \n\t" + "vl %%v20,80(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,80(%%r1,%3) \n\t" + "vl %%v23,80(%%r1,%4) \n\t" + "vl %%v24,96(%%r1,%1) \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vl %%v28,112(%%r1,%1) \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%4) \n\t" -#elif HAVE_KERNEL_4x2_VEC + "vl %%v4,64(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,64(%%r1,%6) \n\t" -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; + "vl %%v4,80(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,80(%%r1,%6) \n\t" - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } -} -#else + "vl %%v4,96(%%r1,%6) \n\t" + "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,96(%%r1,%6) \n\t" -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT *a0,*a1; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 = ap[1]; - - for ( i=0; i<2; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; - } -} - - -#endif - -#ifdef HAVE_KERNEL_4x1 - -#elif HAVE_KERNEL_4x1_VEC -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0; - x0 = xo[0] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } + "vl %%v4,112(%%r1,%6) \n\t" + "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,112(%%r1,%6) \n\t" - + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap; + __asm__ volatile ( + "vlrepg %%v0,0(%3) \n\t" + "vlrepg %%v1,8(%3) \n\t" + "vlrepg %%v2,%5 \n\t" + "vfmdb %%v0,%%v0,%%v2 \n\t" + "vfmdb %%v1,%%v1,%%v2 \n\t" + "xgr %%r1,%%r1 \n\t" - for ( i=0; i<1; i++) - x[i] = xo[i] * *alpha; + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0]; - y[i+1] += a0[i+1]*x[0]; - y[i+2] += a0[i+2]*x[0]; - y[i+3] += a0[i+3]*x[0]; - } + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "vl %%v20,32(%%r1,%1) \n\t" + "vl %%v21,32(%%r1,%2) \n\t" + "vl %%v22,48(%%r1,%1) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vl %%v26,80(%%r1,%1) \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vl %%v28,96(%%r1,%1) \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%1) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "vl %%v2,32(%%r1,%4) \n\t" + "vfmadb %%v2,%%v20,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v21,%%v1,%%v2 \n\t" + "vst %%v2,32(%%r1,%4) \n\t" + + "vl %%v2,48(%%r1,%4) \n\t" + "vfmadb %%v2,%%v22,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v23,%%v1,%%v2 \n\t" + "vst %%v2,48(%%r1,%4) \n\t" + + "vl %%v2,64(%%r1,%4) \n\t" + "vfmadb %%v2,%%v24,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v25,%%v1,%%v2 \n\t" + "vst %%v2,64(%%r1,%4) \n\t" + + "vl %%v2,80(%%r1,%4) \n\t" + "vfmadb %%v2,%%v26,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v27,%%v1,%%v2 \n\t" + "vst %%v2,80(%%r1,%4) \n\t" + + "vl %%v2,96(%%r1,%4) \n\t" + "vfmadb %%v2,%%v28,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v29,%%v1,%%v2 \n\t" + "vst %%v2,96(%%r1,%4) \n\t" + + "vl %%v2,112(%%r1,%4) \n\t" + "vfmadb %%v2,%%v30,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v31,%%v1,%%v2 \n\t" + "vst %%v2,112(%%r1,%4) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepg %%v0,0(%2) \n\t" + "vlrepg %%v1,%4 \n\t" + "vfmdb %%v0,%%v0,%%v1 \n\t" + "xgr %%r1,%%r1 \n\t" -#endif + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" - + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%1) \n\t" + "vl %%v22,96(%%r1,%1) \n\t" + "vl %%v23,112(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "vl %%v1,32(%%r1,%3) \n\t" + "vfmadb %%v1,%%v18,%%v0,%%v1 \n\t" + "vst %%v1,32(%%r1,%3) \n\t" + + "vl %%v1,48(%%r1,%3) \n\t" + "vfmadb %%v1,%%v19,%%v0,%%v1 \n\t" + "vst %%v1,48(%%r1,%3) \n\t" + + "vl %%v1,64(%%r1,%3) \n\t" + "vfmadb %%v1,%%v20,%%v0,%%v1 \n\t" + "vst %%v1,64(%%r1,%3) \n\t" + + "vl %%v1,80(%%r1,%3) \n\t" + "vfmadb %%v1,%%v21,%%v0,%%v1 \n\t" + "vst %%v1,80(%%r1,%3) \n\t" + + "vl %%v1,96(%%r1,%3) \n\t" + "vfmadb %%v1,%%v22,%%v0,%%v1 \n\t" + "vst %%v1,96(%%r1,%3) \n\t" + + "vl %%v1,112(%%r1,%3) \n\t" + "vfmadb %%v1,%%v23,%%v0,%%v1 \n\t" + "vst %%v1,112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; - - for ( i=0; i -#endif #define NBMAX 2048 -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; - __vector double temp2 = {0,0}; - __vector double temp3 = {0,0}; + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ; - temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; - y[2] = temp2[0] + temp2[1]; - y[3] = temp3[0] + temp3[1];; + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmadb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmadb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmadb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmadb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmadb %%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmadb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmadb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vfmadb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmadb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v4,%%v0,1 \n\t" + "adbr %%f0,%%f4 \n\t" + "std %%f0,0(%6) \n\t" + "vrepg %%v4,%%v1,1 \n\t" + "adbr %%f1,%%f4 \n\t" + "std %%f1,8(%6) \n\t" + "vrepg %%v4,%%v2,1 \n\t" + "adbr %%f2,%%f4 \n\t" + "std %%f2,16(%6) \n\t" + "vrepg %%v4,%%v3,1 \n\t" + "adbr %%f3,%%f4 \n\t" + "std %%f3,24(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; - temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; - temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; - } - y[0] = temp0; - y[1] = temp1; - y[2] = temp2; - y[3] = temp3; -} - -#endif - -#ifdef HAVE_KERNEL_4x2 - -#elif HAVE_KERNEL_4x2_VEC static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmadb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmadb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmadb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmadb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmadb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v2,%%v0,1 \n\t" + "adbr %%f0,%%f2 \n\t" + "std %%f0,0(%4) \n\t" + "vrepg %%v2,%%v1,1 \n\t" + "adbr %%f1,%%f2 \n\t" + "std %%f1,8(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - - BLASLONG i; - FLOAT *a0,*a1; - a0 = ap[0]; - a1 = ap[1]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; - } - y[0] = temp0; - y[1] = temp1; - -} -#endif - -#ifdef HAVE_KERNEL_4x1 - -#elif HAVE_KERNEL_4x1_VEC static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)a0; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; -} -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - - - FLOAT temp0 = 0.0; + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - } - y[0] = temp0; + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "std %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - + static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for ( i=0; i 0) { + + maxf = dmax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c new file mode 100644 index 000000000..d7c86735f --- /dev/null +++ b/kernel/zarch/dmin.c @@ -0,0 +1,182 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = dmin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index bf29538c7..c91f95800 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc", "r1" ,"v0","v1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ ( + "vlrepg %%v0,%3 \n\t" + "vlrepg %%v1,%4 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -214,8 +204,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -32; if ( n1 > 0 ) { - - drot_kernel_32(n1, x, y, c, s); + FLOAT cosa,sina; + cosa=c; + sina=s; + drot_kernel_32(n1, x, y, &cosa, &sina); i=n1; } @@ -229,6 +221,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + } else { @@ -250,3 +243,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index e29f51012..ccc6dd95d 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -27,135 +27,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#ifdef Z13_A -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) +static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__ volatile ( + "vlrepg %%v0,%1 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%2) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vl %%v25, 16(%%r1,%2) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vl %%v26, 32(%%r1,%2) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vl %%v27, 48(%%r1,%2) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 64(%%r1,%2) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 64(%%r1,%2) \n\t" + "vl %%v25, 80(%%r1,%2) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 80(%%r1,%2) \n\t" + "vl %%v26, 96(%%r1,%2) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 96(%%r1,%2) \n\t" + "vl %%v27, 112(%%r1,%2) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v24","v25","v26","v27" + ); +} - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "srlg %[n],%[n],4 \n\t" - "vlr %%v1,%%v0 \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "la %[x_ptr], 128(%[x_ptr]) \n\t" - "aghik %[n], %[n], -1 \n\t" - "jle 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v0 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "vlm %%v16,%%v19, 0(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vlm %%v20,%%v23, 64(%[x_ptr]) \n\t" - "lay %[x_ptr], -128(%[x_ptr]) \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "brctg %[n],1b \n\t" - "2: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v1 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "lay %[x_ptr] , -128(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n) - : [alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - } -#else -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) +static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" - /* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */ - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "vlr %%v1,%%v0 \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v1 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v1 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v1 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v1 \n\t" - "vstm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vlm %%v24,%%v31,128(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vfmdb %%v25,%%v25,%%v1 \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vfmdb %%v27,%%v27,%%v1 \n\t" - "vfmdb %%v28,%%v28,%%v0 \n\t" - "vfmdb %%v29,%%v29,%%v1 \n\t" - "vfmdb %%v30,%%v30,%%v0 \n\t" - "vfmdb %%v31,%%v31,%%v1 \n\t" - "vstm %%v24,%%v31,128(%[x_ptr]) \n\t" - "la %[x_ptr], 256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - } -#endif -static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) -{ - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "sllg %%r0,%[n],3 \n\t" - "vzero %%v25 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vst %%v24, 32(%[x_ptr]) \n\t" - "vst %%v25, 48(%[x_ptr]) \n\t" - "vst %%v24, 64(%[x_ptr]) \n\t" - "vst %%v25, 80(%[x_ptr]) \n\t" - "vst %%v24, 96(%[x_ptr]) \n\t" - "vst %%v25, 112(%[x_ptr]) \n\t" - "vst %%v24, 128(%[x_ptr]) \n\t" - "vst %%v25, 144(%[x_ptr]) \n\t" - "vst %%v24, 160(%[x_ptr]) \n\t" - "vst %%v25, 176(%[x_ptr]) \n\t" - "vst %%v24, 192(%[x_ptr]) \n\t" - "vst %%v25, 208(%[x_ptr]) \n\t" - "vst %%v24, 224(%[x_ptr]) \n\t" - "vst %%v25, 240(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" , "r0", "v24" ,"v25" - ); + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); } - - - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0,j=0; @@ -169,11 +109,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 > 0 ) { - dscal_kernel_32_zero(n1 , x); + dscal_kernel_16_zero(n1, x); j=n1; } @@ -188,10 +128,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 > 0 ) { - dscal_kernel_32(n1 , da , x); + dscal_kernel_16(n1, da, x); j=n1; } while(j < n) @@ -260,4 +200,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } return 0; -} \ No newline at end of file +} + + diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c new file mode 100644 index 000000000..17461a029 --- /dev/null +++ b/kernel/zarch/dsdot.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2018,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + double dot; + + __asm__ volatile ( + "vzero %%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%3) \n\t" + "vfmsb %%v16,%%v16,%%v24 \n\t" + "vl %%v25,16(%%r1,%3) \n\t" + "vfmsb %%v17,%%v17,%%v25 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmsb %%v18,%%v18,%%v26 \n\t" + "vl %%v27,48(%%r1,%3) \n\t" + "vfmsb %%v19,%%v19,%%v27 \n\t" + "vl %%v28,64(%%r1,%3) \n\t" + "vfmsb %%v20,%%v20,%%v28 \n\t" + "vl %%v29,80(%%r1,%3) \n\t" + "vfmsb %%v21,%%v21,%%v29 \n\t" + "vl %%v30,96(%%r1,%3) \n\t" + "vfmsb %%v22,%%v22,%%v30 \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vfmsb %%v23,%%v23,%%v31 \n\t" + + "vflls %%v24,%%v16 \n\t" + "vflls %%v25,%%v17 \n\t" + "vflls %%v26,%%v18 \n\t" + "vflls %%v27,%%v19 \n\t" + "vflls %%v28,%%v20 \n\t" + "vflls %%v29,%%v21 \n\t" + "vflls %%v30,%%v22 \n\t" + "vflls %%v31,%%v23 \n\t" + + "veslg %%v16,%%v16,32 \n\t" + "veslg %%v17,%%v17,32 \n\t" + "veslg %%v18,%%v18,32 \n\t" + "veslg %%v19,%%v19,32 \n\t" + "veslg %%v20,%%v20,32 \n\t" + "veslg %%v21,%%v21,32 \n\t" + "veslg %%v22,%%v22,32 \n\t" + "veslg %%v23,%%v23,32 \n\t" + + "vflls %%v16,%%v16 \n\t" + "vflls %%v17,%%v17 \n\t" + "vflls %%v18,%%v18 \n\t" + "vflls %%v19,%%v19 \n\t" + "vflls %%v20,%%v20 \n\t" + "vflls %%v21,%%v21 \n\t" + "vflls %%v22,%%v22 \n\t" + "vflls %%v23,%%v23 \n\t" + + "vfadb %%v16,%%v16,%%v24 \n\t" + "vfadb %%v17,%%v17,%%v25 \n\t" + "vfadb %%v18,%%v18,%%v26 \n\t" + "vfadb %%v19,%%v19,%%v27 \n\t" + "vfadb %%v20,%%v20,%%v28 \n\t" + "vfadb %%v21,%%v21,%%v29 \n\t" + "vfadb %%v22,%%v22,%%v30 \n\t" + "vfadb %%v23,%%v23,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v20 \n\t" + "vfadb %%v17,%%v17,%%v21 \n\t" + "vfadb %%v18,%%v18,%%v22 \n\t" + "vfadb %%v19,%%v19,%%v23 \n\t" + "vfadb %%v16,%%v16,%%v18 \n\t" + "vfadb %%v17,%%v17,%%v19 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v0,%%v16,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(dot) + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return dot; +} + +double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + double dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + dot = dsdot_kernel_32(n1,x,y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index d7e079147..8070ef41a 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,217 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - - #include "common.h" - - -#if defined(Z13_SWAP_A) -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" + __asm__ volatile( + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else - -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} - -#endif - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; @@ -284,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, } - - diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c new file mode 100644 index 000000000..e7f096e0d --- /dev/null +++ b/kernel/zarch/icamax.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v0,8(%3),1 \n\t" + "vlef %%v1,12(%3),1 \n\t" + "vlef %%v0,16(%3),2 \n\t" + "vlef %%v1,20(%3),2 \n\t" + "vlef %%v0,24(%3),3 \n\t" + "vlef %%v1,28(%3),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v1,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,16 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%3) \n\t" + + "vlef %%v16,0(%%r1,%3),0 \n\t" + "vlef %%v17,4(%%r1,%3),0 \n\t" + "vlef %%v16,8(%%r1,%3),1 \n\t" + "vlef %%v17,12(%%r1,%3),1 \n\t" + "vlef %%v16,16(%%r1,%3),2 \n\t" + "vlef %%v17,20(%%r1,%3),2 \n\t" + "vlef %%v16,24(%%r1,%3),3 \n\t" + "vlef %%v17,28(%%r1,%3),3 \n\t" + + "vlef %%v18,32(%%r1,%3),0 \n\t" + "vlef %%v19,36(%%r1,%3),0 \n\t" + "vlef %%v18,40(%%r1,%3),1 \n\t" + "vlef %%v19,44(%%r1,%3),1 \n\t" + "vlef %%v18,48(%%r1,%3),2 \n\t" + "vlef %%v19,52(%%r1,%3),2 \n\t" + "vlef %%v18,56(%%r1,%3),3 \n\t" + "vlef %%v19,30(%%r1,%3),3 \n\t" + + "vlef %%v20,64(%%r1,%3),0 \n\t" + "vlef %%v21,68(%%r1,%3),0 \n\t" + "vlef %%v20,72(%%r1,%3),1 \n\t" + "vlef %%v21,76(%%r1,%3),1 \n\t" + "vlef %%v20,80(%%r1,%3),2 \n\t" + "vlef %%v21,84(%%r1,%3),2 \n\t" + "vlef %%v20,88(%%r1,%3),3 \n\t" + "vlef %%v21,92(%%r1,%3),3 \n\t" + + "vlef %%v22,96(%%r1,%3),0 \n\t" + "vlef %%v23,100(%%r1,%3),0 \n\t" + "vlef %%v22,104(%%r1,%3),1 \n\t" + "vlef %%v23,108(%%r1,%3),1 \n\t" + "vlef %%v22,112(%%r1,%3),2 \n\t" + "vlef %%v23,116(%%r1,%3),2 \n\t" + "vlef %%v22,120(%%r1,%3),3 \n\t" + "vlef %%v23,124(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vlef %%v16,128(%%r1,%3),0 \n\t" + "vlef %%v17,132(%%r1,%3),0 \n\t" + "vlef %%v16,136(%%r1,%3),1 \n\t" + "vlef %%v17,140(%%r1,%3),1 \n\t" + "vlef %%v16,144(%%r1,%3),2 \n\t" + "vlef %%v17,148(%%r1,%3),2 \n\t" + "vlef %%v16,152(%%r1,%3),3 \n\t" + "vlef %%v17,156(%%r1,%3),3 \n\t" + + "vlef %%v18,160(%%r1,%3),0 \n\t" + "vlef %%v19,164(%%r1,%3),0 \n\t" + "vlef %%v18,168(%%r1,%3),1 \n\t" + "vlef %%v19,172(%%r1,%3),1 \n\t" + "vlef %%v18,176(%%r1,%3),2 \n\t" + "vlef %%v19,180(%%r1,%3),2 \n\t" + "vlef %%v18,184(%%r1,%3),3 \n\t" + "vlef %%v19,188(%%r1,%3),3 \n\t" + + "vlef %%v20,192(%%r1,%3),0 \n\t" + "vlef %%v21,196(%%r1,%3),0 \n\t" + "vlef %%v20,200(%%r1,%3),1 \n\t" + "vlef %%v21,204(%%r1,%3),1 \n\t" + "vlef %%v20,208(%%r1,%3),2 \n\t" + "vlef %%v21,212(%%r1,%3),2 \n\t" + "vlef %%v20,216(%%r1,%3),3 \n\t" + "vlef %%v21,220(%%r1,%3),3 \n\t" + + "vlef %%v22,224(%%r1,%3),0 \n\t" + "vlef %%v23,228(%%r1,%3),0 \n\t" + "vlef %%v22,232(%%r1,%3),1 \n\t" + "vlef %%v23,236(%%r1,%3),1 \n\t" + "vlef %%v22,240(%%r1,%3),2 \n\t" + "vlef %%v23,244(%%r1,%3),2 \n\t" + "vlef %%v22,248(%%r1,%3),3 \n\t" + "vlef %%v23,252(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = icamax_kernel_32(n1, x, &maxf); + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } +} + + diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c new file mode 100644 index 000000000..b9c1ccd9c --- /dev/null +++ b/kernel/zarch/icamin.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v0,8(%3),1 \n\t" + "vlef %%v1,12(%3),1 \n\t" + "vlef %%v0,16(%3),2 \n\t" + "vlef %%v1,20(%3),2 \n\t" + "vlef %%v0,24(%3),3 \n\t" + "vlef %%v1,28(%3),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v1,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,16 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vlef %%v16,0(%%r1,%3),0 \n\t" + "vlef %%v17,4(%%r1,%3),0 \n\t" + "vlef %%v16,8(%%r1,%3),1 \n\t" + "vlef %%v17,12(%%r1,%3),1 \n\t" + "vlef %%v16,16(%%r1,%3),2 \n\t" + "vlef %%v17,20(%%r1,%3),2 \n\t" + "vlef %%v16,24(%%r1,%3),3 \n\t" + "vlef %%v17,28(%%r1,%3),3 \n\t" + + "vlef %%v18,32(%%r1,%3),0 \n\t" + "vlef %%v19,36(%%r1,%3),0 \n\t" + "vlef %%v18,40(%%r1,%3),1 \n\t" + "vlef %%v19,44(%%r1,%3),1 \n\t" + "vlef %%v18,48(%%r1,%3),2 \n\t" + "vlef %%v19,52(%%r1,%3),2 \n\t" + "vlef %%v18,56(%%r1,%3),3 \n\t" + "vlef %%v19,30(%%r1,%3),3 \n\t" + + "vlef %%v20,64(%%r1,%3),0 \n\t" + "vlef %%v21,68(%%r1,%3),0 \n\t" + "vlef %%v20,72(%%r1,%3),1 \n\t" + "vlef %%v21,76(%%r1,%3),1 \n\t" + "vlef %%v20,80(%%r1,%3),2 \n\t" + "vlef %%v21,84(%%r1,%3),2 \n\t" + "vlef %%v20,88(%%r1,%3),3 \n\t" + "vlef %%v21,92(%%r1,%3),3 \n\t" + + "vlef %%v22,96(%%r1,%3),0 \n\t" + "vlef %%v23,100(%%r1,%3),0 \n\t" + "vlef %%v22,104(%%r1,%3),1 \n\t" + "vlef %%v23,108(%%r1,%3),1 \n\t" + "vlef %%v22,112(%%r1,%3),2 \n\t" + "vlef %%v23,116(%%r1,%3),2 \n\t" + "vlef %%v22,120(%%r1,%3),3 \n\t" + "vlef %%v23,124(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vlef %%v16,128(%%r1,%3),0 \n\t" + "vlef %%v17,132(%%r1,%3),0 \n\t" + "vlef %%v16,136(%%r1,%3),1 \n\t" + "vlef %%v17,140(%%r1,%3),1 \n\t" + "vlef %%v16,144(%%r1,%3),2 \n\t" + "vlef %%v17,148(%%r1,%3),2 \n\t" + "vlef %%v16,152(%%r1,%3),3 \n\t" + "vlef %%v17,156(%%r1,%3),3 \n\t" + + "vlef %%v18,160(%%r1,%3),0 \n\t" + "vlef %%v19,164(%%r1,%3),0 \n\t" + "vlef %%v18,168(%%r1,%3),1 \n\t" + "vlef %%v19,172(%%r1,%3),1 \n\t" + "vlef %%v18,176(%%r1,%3),2 \n\t" + "vlef %%v19,180(%%r1,%3),2 \n\t" + "vlef %%v18,184(%%r1,%3),3 \n\t" + "vlef %%v19,188(%%r1,%3),3 \n\t" + + "vlef %%v20,192(%%r1,%3),0 \n\t" + "vlef %%v21,196(%%r1,%3),0 \n\t" + "vlef %%v20,200(%%r1,%3),1 \n\t" + "vlef %%v21,204(%%r1,%3),1 \n\t" + "vlef %%v20,208(%%r1,%3),2 \n\t" + "vlef %%v21,212(%%r1,%3),2 \n\t" + "vlef %%v20,216(%%r1,%3),3 \n\t" + "vlef %%v21,220(%%r1,%3),3 \n\t" + + "vlef %%v22,224(%%r1,%3),0 \n\t" + "vlef %%v23,228(%%r1,%3),0 \n\t" + "vlef %%v22,232(%%r1,%3),1 \n\t" + "vlef %%v23,236(%%r1,%3),1 \n\t" + "vlef %%v22,240(%%r1,%3),2 \n\t" + "vlef %%v23,244(%%r1,%3),2 \n\t" + "vlef %%v22,248(%%r1,%3),3 \n\t" + "vlef %%v23,252(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = icamin_kernel_32(n1, x, &minf); + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } +} + + diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index b67091148..aba880949 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -23,164 +23,173 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vzero %%v5 \n\t" - "vzero %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "vfchdb %%v16,%%v25,%%v24 \n\t " - "vfchdb %%v17,%%v27,%%v26 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v29,%%v28 \n\t " - "vfchdb %%v17,%%v31,%%v30 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" +static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "vfchdb %%v16,%%v25,%%v24 \n\t " - "vfchdb %%v17,%%v27,%%v26 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v29,%%v28 \n\t " - "vfchdb %%v17,%%v31,%%v30 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "clgrjl %[ptr_tmp],%%r0,1b \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "2: \n\t" - "wfchdb %%v16,%%v26,%%v18 \n\t" - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[maxf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return index; + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamax; } - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; - BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG max = 0; @@ -191,7 +200,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -32; if (n1 > 0) { - max = diamax_kernel_32_TUNED(n1, x, &maxf); + max = idamax_kernel_32(n1, x, &maxf); i = n1; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 8a7ff1659..3213efa4d 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -23,192 +23,185 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vlrepg %%v18,0(%[ptr_x]) \n\t" - "vzero %%v5 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" +static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" - "vfchdb %%v16,%%v24,%%v25 \n\t " - "vfchdb %%v17,%%v26 ,%%v27 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28, %%v29 \n\t " - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v16,%%v24,%%v25 \n\t" - "vfchdb %%v17,%%v26 ,%%v27 \n\t" - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28 ,%%v29 \n\t" - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" - - "2: \n\t" - "wfchdb %%v16,%%v18 ,%%v26 \n\t " - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[minf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - - return index; + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return iamin; } - - - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; - BLASLONG ix = 0; - BLASLONG min = 0; FLOAT minf = 0.0; - + BLASLONG min = 0; + if (n <= 0 || inc_x <= 0) return (min); - minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant + if (inc_x == 1) { BLASLONG n1 = n & -32; if (n1 > 0) { - min = diamin_kernel_32(n1, x, &minf); + min = idamin_kernel_32(n1, x, &minf); + i = n1; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c new file mode 100644 index 000000000..26fff4eb0 --- /dev/null +++ b/kernel/zarch/idmax.c @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) +{ + BLASLONG imax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imax),"=m"(*max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = idmax_kernel_32(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c new file mode 100644 index 000000000..570b33a15 --- /dev/null +++ b/kernel/zarch/idmin.c @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) +{ + BLASLONG imin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imin),"=m"(*min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = idmin_kernel_32(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c new file mode 100644 index 000000000..95a665b10 --- /dev/null +++ b/kernel/zarch/isamax.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = isamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c new file mode 100644 index 000000000..640fc02c9 --- /dev/null +++ b/kernel/zarch/isamin.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = isamin_kernel_64(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c new file mode 100644 index 000000000..0eb350315 --- /dev/null +++ b/kernel/zarch/ismax.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) +{ + BLASLONG imax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imax),"=m"(*max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = ismax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c new file mode 100644 index 000000000..f050db8cb --- /dev/null +++ b/kernel/zarch/ismin.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) +{ + BLASLONG imin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imin),"=m"(*min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = ismin_kernel_64(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 216c3414a..bf5f621a7 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -24,190 +24,165 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) +static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + __asm__ volatile ( + "vleg %%v0,0(%3),0 \n\t" + "vleg %%v1,8(%3),0 \n\t" + "vleg %%v0,16(%3),1 \n\t" + "vleg %%v1,24(%3),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v1,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,8 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "srlg %%r0,%2,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" - -/** - * Find maximum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - - - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v6 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" + "vleg %%v16,0(%%r1,%3),0 \n\t" + "vleg %%v17,8(%%r1,%3),0 \n\t" + "vleg %%v16,16(%%r1,%3),1 \n\t" + "vleg %%v17,24(%%r1,%3),1 \n\t" + "vleg %%v18,32(%%r1,%3),0 \n\t" + "vleg %%v19,40(%%r1,%3),0 \n\t" + "vleg %%v18,48(%%r1,%3),1 \n\t" + "vleg %%v19,56(%%r1,%3),1 \n\t" + "vleg %%v20,64(%%r1,%3),0 \n\t" + "vleg %%v21,72(%%r1,%3),0 \n\t" + "vleg %%v20,80(%%r1,%3),1 \n\t" + "vleg %%v21,88(%%r1,%3),1 \n\t" + "vleg %%v22,96(%%r1,%3),0 \n\t" + "vleg %%v23,104(%%r1,%3),0 \n\t" + "vleg %%v22,112(%%r1,%3),1 \n\t" + "vleg %%v23,120(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 , 128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 , 232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 , 240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 , 248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfchdb %%v25,%%v1,%%v0 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v3,%%v2 \n\t " - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v26,%%v24 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v30,%%v28 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24, %%v1,%%v31 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30, %%v27,%%v3 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0, %%v31,%%v28 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30, %%v27,%%v6 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3 \n\t" - "2: \n\t" - "wfchdb %%v16,%%v26,%%v6 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[maxf] \n\t" - "3: \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" - ); - return index; + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + "vleg %%v16,128(%%r1,%3),0 \n\t" + "vleg %%v17,136(%%r1,%3),0 \n\t" + "vleg %%v16,144(%%r1,%3),1 \n\t" + "vleg %%v17,152(%%r1,%3),1 \n\t" + "vleg %%v18,160(%%r1,%3),0 \n\t" + "vleg %%v19,168(%%r1,%3),0 \n\t" + "vleg %%v18,176(%%r1,%3),1 \n\t" + "vleg %%v19,184(%%r1,%3),1 \n\t" + "vleg %%v20,192(%%r1,%3),0 \n\t" + "vleg %%v21,200(%%r1,%3),0 \n\t" + "vleg %%v20,208(%%r1,%3),1 \n\t" + "vleg %%v21,216(%%r1,%3),1 \n\t" + "vleg %%v22,224(%%r1,%3),0 \n\t" + "vleg %%v23,232(%%r1,%3),0 \n\t" + "vleg %%v22,240(%%r1,%3),1 \n\t" + "vleg %%v23,248(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamax; } - - - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; @@ -223,9 +198,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG n1 = n & -16; if (n1 > 0) { - max = ziamax_kernel_16_TUNED(n1, x, &maxf); + max = izamax_kernel_16(n1, x, &maxf); + i = n1; - ix = n1 << 1; } while(i < n) @@ -260,7 +235,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return (max + 1); } - } diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 9b2a653a7..3636e8fdf 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -24,253 +24,217 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -/** - * Find minimum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index ; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - "ld %%f6,0(%[ptr_x]) \n\t" - "lpdbr %%f6,%%f6 \n\t" - "ld %%f7,8(%[ptr_x]) \n\t" - "lpdbr %%f7,%%f7 \n\t" - "adbr %%f6,%%f7 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vrepg %%v6,%%v6,0 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" +static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vleg %%v0,0(%3),0 \n\t" + "vleg %%v1,8(%3),0 \n\t" + "vleg %%v0,16(%3),1 \n\t" + "vleg %%v1,24(%3),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v1,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,8 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "srlg %%r0,%2,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vleg %%v16,0(%%r1,%3),0 \n\t" + "vleg %%v17,8(%%r1,%3),0 \n\t" + "vleg %%v16,16(%%r1,%3),1 \n\t" + "vleg %%v17,24(%%r1,%3),1 \n\t" + "vleg %%v18,32(%%r1,%3),0 \n\t" + "vleg %%v19,40(%%r1,%3),0 \n\t" + "vleg %%v18,48(%%r1,%3),1 \n\t" + "vleg %%v19,56(%%r1,%3),1 \n\t" + "vleg %%v20,64(%%r1,%3),0 \n\t" + "vleg %%v21,72(%%r1,%3),0 \n\t" + "vleg %%v20,80(%%r1,%3),1 \n\t" + "vleg %%v21,88(%%r1,%3),1 \n\t" + "vleg %%v22,96(%%r1,%3),0 \n\t" + "vleg %%v23,104(%%r1,%3),0 \n\t" + "vleg %%v22,112(%%r1,%3),1 \n\t" + "vleg %%v23,120(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 ,128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 ,136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 ,144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 ,152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 ,160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 ,168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 ,176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 ,184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 ,192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 ,200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 ,208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 ,216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 ,224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - - "vfchdb %%v25,%%v0 ,%%v1 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v2,%%v3 \n\t" - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v24,%%v26 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v28,%%v30 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24,%%v31, %%v1 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30,%%v3, %%v27 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0,%%v28, %%v31 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30,%%v6 , %%v27 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3f \n\t" - "2: \n\t" - "wfchdb %%v16,%%v6 ,%%v26 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[minf] \n\t" - "3: \n\t" + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" - ); + "vleg %%v16,128(%%r1,%3),0 \n\t" + "vleg %%v17,136(%%r1,%3),0 \n\t" + "vleg %%v16,144(%%r1,%3),1 \n\t" + "vleg %%v17,152(%%r1,%3),1 \n\t" + "vleg %%v18,160(%%r1,%3),0 \n\t" + "vleg %%v19,168(%%r1,%3),0 \n\t" + "vleg %%v18,176(%%r1,%3),1 \n\t" + "vleg %%v19,184(%%r1,%3),1 \n\t" + "vleg %%v20,192(%%r1,%3),0 \n\t" + "vleg %%v21,200(%%r1,%3),0 \n\t" + "vleg %%v20,208(%%r1,%3),1 \n\t" + "vleg %%v21,216(%%r1,%3),1 \n\t" + "vleg %%v22,224(%%r1,%3),0 \n\t" + "vleg %%v23,232(%%r1,%3),0 \n\t" + "vleg %%v22,240(%%r1,%3),1 \n\t" + "vleg %%v23,248(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - return index; + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamin; } - - - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf; - BLASLONG min=0; + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(min); - - + if (inc_x == 1) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + BLASLONG n1 = n & -16; + if (n1 > 0) { + + min = izamin_kernel_16(n1, x, &minf); - min = ziamin_kernel_16_TUNED(n1, x, &minf); i = n1; - ix = n1 << 1; - } - else { - //assign minf - minf = CABS1(x,0); - ix += 2; - i++; - } + } - while(i < n) + while(i < n) + { + if( CABS1(x,ix) < minf ) { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; + min = i; + minf = CABS1(x,ix); } + ix += 2; + i++; + } return (min + 1); } else { - inc_x2 = 2 * inc_x; + inc_x2 = 2 * inc_x; - minf = CABS1(x,0); + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } ix += inc_x2; i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } + } return (min + 1); } - } diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c new file mode 100644 index 000000000..1025cfcbf --- /dev/null +++ b/kernel/zarch/samax.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vfchsb %%v26,%%v20,%%v21 \n\t" + "vfchsb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v24,%%v25 \n\t" + "vfchsb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vfchsb %%v26,%%v20,%%v21 \n\t" + "vfchsb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v24,%%v25 \n\t" + "vfchsb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + maxf = samax_kernel_64(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c new file mode 100644 index 000000000..3b8f03e6a --- /dev/null +++ b/kernel/zarch/samin.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = samin_kernel_64(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c new file mode 100644 index 000000000..2c59ab2e5 --- /dev/null +++ b/kernel/zarch/sasum.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT asum; + + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vfasb %%v0,%%v0,%%v3 \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "ler %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return sumf; + + if (inc_x == 1) { + + n1 = n & -64; + + if (n1 > 0) { + + sumf = sasum_kernel_64(n1, x); + i = n1; + } + + while (i < n) { + sumf += ABS(x[i]); + i++; + } + + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { + + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + + i += inc_x * 4; + j += 4; + + } + sumf = sum1 + sum2; + while (j < n) { + + sumf += ABS(x[i]); + i += inc_x; + j++; + } + + + } + return sumf; +} + + diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c new file mode 100644 index 000000000..26ead310c --- /dev/null +++ b/kernel/zarch/saxpy.c @@ -0,0 +1,184 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( + "vlrepf %%v0,%3 \n\t" + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + + "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,80(%%r1,%1) \n\t" + "vl %%v26,96(%%r1,%1) \n\t" + "vl %%v27,112(%%r1,%1) \n\t" + "vl %%v28,64(%%r1,%2) \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vl %%v30,96(%%r1,%2) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" + "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" + "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" + "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "vl %%v16,128(%%r1,%1) \n\t" + "vl %%v17,144(%%r1,%1) \n\t" + "vl %%v18,160(%%r1,%1) \n\t" + "vl %%v19,176(%%r1,%1) \n\t" + "vl %%v20,128(%%r1,%2) \n\t" + "vl %%v21,144(%%r1,%2) \n\t" + "vl %%v22,160(%%r1,%2) \n\t" + "vl %%v23,176(%%r1,%2) \n\t" + + "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" + + "vl %%v24,192(%%r1,%1) \n\t" + "vl %%v25,208(%%r1,%1) \n\t" + "vl %%v26,224(%%r1,%1) \n\t" + "vl %%v27,240(%%r1,%1) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" + + "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" + "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" + "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" + "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" + + "vst %%v16,128(%%r1,%2) \n\t" + "vst %%v17,144(%%r1,%2) \n\t" + "vst %%v18,160(%%r1,%2) \n\t" + "vst %%v19,176(%%r1,%2) \n\t" + "vst %%v20,192(%%r1,%2) \n\t" + "vst %%v21,208(%%r1,%2) \n\t" + "vst %%v22,224(%%r1,%2) \n\t" + "vst %%v23,240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return 0 ; + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + + if ( n1 ) + saxpy_kernel_64(n1, x, y , &da); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return 0 ; + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return 0 ; + +} + + diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c new file mode 100644 index 000000000..ff4227595 --- /dev/null +++ b/kernel/zarch/scopy.c @@ -0,0 +1,85 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,6 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","r2" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + if (n <= 0) return 0; + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + scopy_kernel_64(n1, x, y); + i = n1; + } + + while (i < n) { + y[i] = x[i]; + i++; + + } + + + } else { + + while (i < n) { + + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; + + } + + } + return 0; + + +} diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c new file mode 100644 index 000000000..fd8c8e445 --- /dev/null +++ b/kernel/zarch/sdot.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2018,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + FLOAT dot; + + __asm__ volatile ( + "vzero %%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%3) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,16(%%r1,%3) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + "vl %%v27,48(%%r1,%3) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + "vl %%v28,64(%%r1,%3) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%3) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + "vl %%v30,96(%%r1,%3) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepf %%v1,%%v0,1 \n\t" + "vrepf %%v2,%%v0,2 \n\t" + "vrepf %%v3,%%v0,3 \n\t" + "aebr %%f0,%%f1 \n\t" + "aebr %%f0,%%f2 \n\t" + "aebr %%f0,%%f3 \n\t" + "ler %0,%%f0 " + :"=f"(dot) + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return dot; +} + +FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + dot = sdot_kernel_32(n1,x,y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c new file mode 100644 index 000000000..92019d732 --- /dev/null +++ b/kernel/zarch/sgemv_n_4.c @@ -0,0 +1,668 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%5) \n\t" + "vlrepf %%v1,4(%5) \n\t" + "vlrepf %%v2,8(%5) \n\t" + "vlrepf %%v3,12(%5) \n\t" + "vlrepf %%v4,%7 \n\t" + "vfmsb %%v0,%%v0,%%v4 \n\t" + "vfmsb %%v1,%%v1,%%v4 \n\t" + "vfmsb %%v2,%%v2,%%v4 \n\t" + "vfmsb %%v3,%%v3,%%v4 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + "vl %%v24,32(%%r1,%1) \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vl %%v28,48(%%r1,%1) \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "vl %%v4,32(%%r1,%6) \n\t" + "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,32(%%r1,%6) \n\t" + + "vl %%v4,48(%%r1,%6) \n\t" + "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,48(%%r1,%6) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,64(%%r1,%2) \n\t" + "vl %%v18,64(%%r1,%3) \n\t" + "vl %%v19,64(%%r1,%4) \n\t" + "vl %%v20,80(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,80(%%r1,%3) \n\t" + "vl %%v23,80(%%r1,%4) \n\t" + "vl %%v24,96(%%r1,%1) \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vl %%v28,112(%%r1,%1) \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + + "vl %%v4,64(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,64(%%r1,%6) \n\t" + + "vl %%v4,80(%%r1,%6) \n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,80(%%r1,%6) \n\t" + + "vl %%v4,96(%%r1,%6) \n\t" + "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,96(%%r1,%6) \n\t" + + "vl %%v4,112(%%r1,%6) \n\t" + "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,112(%%r1,%6) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%3) \n\t" + "vlrepf %%v1,4(%3) \n\t" + "vlrepf %%v2,%5 \n\t" + "vfmsb %%v0,%%v0,%%v2 \n\t" + "vfmsb %%v1,%%v1,%%v2 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "vl %%v20,32(%%r1,%1) \n\t" + "vl %%v21,32(%%r1,%2) \n\t" + "vl %%v22,48(%%r1,%1) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vl %%v26,80(%%r1,%1) \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vl %%v28,96(%%r1,%1) \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%1) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmasb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "vl %%v2,32(%%r1,%4) \n\t" + "vfmasb %%v2,%%v20,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v21,%%v1,%%v2 \n\t" + "vst %%v2,32(%%r1,%4) \n\t" + + "vl %%v2,48(%%r1,%4) \n\t" + "vfmasb %%v2,%%v22,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v23,%%v1,%%v2 \n\t" + "vst %%v2,48(%%r1,%4) \n\t" + + "vl %%v2,64(%%r1,%4) \n\t" + "vfmasb %%v2,%%v24,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v25,%%v1,%%v2 \n\t" + "vst %%v2,64(%%r1,%4) \n\t" + + "vl %%v2,80(%%r1,%4) \n\t" + "vfmasb %%v2,%%v26,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v27,%%v1,%%v2 \n\t" + "vst %%v2,80(%%r1,%4) \n\t" + + "vl %%v2,96(%%r1,%4) \n\t" + "vfmasb %%v2,%%v28,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v29,%%v1,%%v2 \n\t" + "vst %%v2,96(%%r1,%4) \n\t" + + "vl %%v2,112(%%r1,%4) \n\t" + "vfmasb %%v2,%%v30,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v31,%%v1,%%v2 \n\t" + "vst %%v2,112(%%r1,%4) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%2) \n\t" + "vlrepf %%v1,%4 \n\t" + "vfmsb %%v0,%%v0,%%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%1) \n\t" + "vl %%v22,96(%%r1,%1) \n\t" + "vl %%v23,112(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmasb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "vl %%v1,32(%%r1,%3) \n\t" + "vfmasb %%v1,%%v18,%%v0,%%v1 \n\t" + "vst %%v1,32(%%r1,%3) \n\t" + + "vl %%v1,48(%%r1,%3) \n\t" + "vfmasb %%v1,%%v19,%%v0,%%v1 \n\t" + "vst %%v1,48(%%r1,%3) \n\t" + + "vl %%v1,64(%%r1,%3) \n\t" + "vfmasb %%v1,%%v20,%%v0,%%v1 \n\t" + "vst %%v1,64(%%r1,%3) \n\t" + + "vl %%v1,80(%%r1,%3) \n\t" + "vfmasb %%v1,%%v21,%%v0,%%v1 \n\t" + "vst %%v1,80(%%r1,%3) \n\t" + + "vl %%v1,96(%%r1,%3) \n\t" + "vfmasb %%v1,%%v22,%%v0,%%v1 \n\t" + "vst %%v1,96(%%r1,%3) \n\t" + + "vl %%v1,112(%%r1,%3) \n\t" + "vfmasb %%v1,%%v23,%%v0,%%v1 \n\t" + "vst %%v1,112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i]; + dest += inc_dest; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8],*ybuffer; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + ybuffer = buffer; + + n1 = n >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c new file mode 100644 index 000000000..efc06297f --- /dev/null +++ b/kernel/zarch/sgemv_t_4.c @@ -0,0 +1,826 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepf %%v4,%%v0,1 \n\t" + "aebr %%f0,%%f4 \n\t" + "vrepf %%v4,%%v0,2 \n\t" + "aebr %%f0,%%f4 \n\t" + "vrepf %%v4,%%v0,3 \n\t" + "aebr %%f0,%%f4 \n\t" + "ste %%f0,0(%6) \n\t" + "vrepf %%v4,%%v1,1 \n\t" + "aebr %%f1,%%f4 \n\t" + "vrepf %%v4,%%v1,2 \n\t" + "aebr %%f1,%%f4 \n\t" + "vrepf %%v4,%%v1,3 \n\t" + "aebr %%f1,%%f4 \n\t" + "ste %%f1,4(%6) \n\t" + "vrepf %%v4,%%v2,1 \n\t" + "aebr %%f2,%%f4 \n\t" + "vrepf %%v4,%%v2,2 \n\t" + "aebr %%f2,%%f4 \n\t" + "vrepf %%v4,%%v2,3 \n\t" + "aebr %%f2,%%f4 \n\t" + "ste %%f2,8(%6) \n\t" + "vrepf %%v4,%%v3,1 \n\t" + "aebr %%f3,%%f4 \n\t" + "vrepf %%v4,%%v3,2 \n\t" + "aebr %%f3,%%f4 \n\t" + "vrepf %%v4,%%v3,3 \n\t" + "aebr %%f3,%%f4 \n\t" + "ste %%f3,12(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepf %%v2,%%v0,1 \n\t" + "aebr %%f0,%%f2 \n\t" + "vrepf %%v2,%%v0,2 \n\t" + "aebr %%f0,%%f2 \n\t" + "vrepf %%v2,%%v0,3 \n\t" + "aebr %%f0,%%f2 \n\t" + "ste %%f0,0(%4) \n\t" + "vrepf %%v2,%%v1,1 \n\t" + "aebr %%f1,%%f2 \n\t" + "vrepf %%v2,%%v1,2 \n\t" + "aebr %%f1,%%f2 \n\t" + "vrepf %%v2,%%v1,3 \n\t" + "aebr %%f1,%%f2 \n\t" + "ste %%f1,4(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepf %%v1,%%v0,1 \n\t" + "aebr %%f0,%%f1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "vrepf %%v1,%%v0,3 \n\t" + "aebr %%f0,%%f1 \n\t" + "ste %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + dest[i] = *src; + src += inc_src; + } +} + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) +{ + __asm__ volatile ( + "vlrepf %%v0,%1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + "vl %%v25, 16(%%r1,%3) \n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" + "vst %%v25, 16(%%r1,%3) \n\t" + "vl %%v26, 32(%%r1,%3) \n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" + "vst %%v26, 32(%%r1,%3) \n\t" + "vl %%v27, 48(%%r1,%3) \n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" + "vst %%v27, 48(%%r1,%3) \n\t" + "vl %%v28, 64(%%r1,%3) \n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" + "vst %%v28, 64(%%r1,%3) \n\t" + "vl %%v29, 80(%%r1,%3) \n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" + "vst %%v29, 80(%%r1,%3) \n\t" + "vl %%v30, 96(%%r1,%3) \n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" + "vst %%v30, 96(%%r1,%3) \n\t" + "vl %%v31, 112(%%r1,%3) \n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" + "vst %%v31, 112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else + { + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i] * da; + dest += inc_dest; + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG register i; + BLASLONG register j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + BLASLONG n0; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[2] __attribute__ ((aligned(16))); + FLOAT *xbuffer; + FLOAT *ytemp; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + xbuffer = buffer; + ytemp = buffer + (m < NBMAX ? m : NBMAX); + + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j 0) { + + maxf = smax_kernel_64(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c new file mode 100644 index 000000000..e882b7ff1 --- /dev/null +++ b/kernel/zarch/smin.c @@ -0,0 +1,186 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = smin_kernel_64(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c new file mode 100644 index 000000000..763cc664a --- /dev/null +++ b/kernel/zarch/srot.c @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + __asm__ ( + "vlrepf %%v0,%3 \n\t" + "vlrepf %%v1,%4 \n\t" + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT temp; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + if ( n1 > 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + srot_kernel_64(n1, x, y, &cosa, &sina); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c new file mode 100644 index 000000000..c18a7e56f --- /dev/null +++ b/kernel/zarch/sscal.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) +{ + __asm__ volatile ( + "vlrepf %%v0,%1 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%2) \n\t" + "vfmsb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vl %%v25, 16(%%r1,%2) \n\t" + "vfmsb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vl %%v26, 32(%%r1,%2) \n\t" + "vfmsb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vl %%v27, 48(%%r1,%2) \n\t" + "vfmsb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 64(%%r1,%2) \n\t" + "vfmsb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 64(%%r1,%2) \n\t" + "vl %%v25, 80(%%r1,%2) \n\t" + "vfmsb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 80(%%r1,%2) \n\t" + "vl %%v26, 96(%%r1,%2) \n\t" + "vfmsb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 96(%%r1,%2) \n\t" + "vl %%v27, 112(%%r1,%2) \n\t" + "vfmsb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v24","v25","v26","v27" + ); +} + +static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0,j=0; + if ( n <= 0 || inc_x <=0 ) + return(0); + + + if ( inc_x == 1 ) + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + + sscal_kernel_32_zero(n1, x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + sscal_kernel_32(n1, da, x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i]=0.0; + x[i + inc_x]=0.0; + + i += inc_x * 2; + j += 2; + + } + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i] = da * x[i] ; + x[i + inc_x] = da * x[i + inc_x]; + + i += inc_x * 2; + j += 2; + + } + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c new file mode 100644 index 000000000..d0c0dc3f4 --- /dev/null +++ b/kernel/zarch/sswap.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -64; + if ( n1 > 0 ) + { + sswap_kernel_64(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c new file mode 100644 index 000000000..6393b099b --- /dev/null +++ b/kernel/zarch/zamax.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + maxf = zamax_kernel_16(n1, x); + + i = n1; + } + else + { + maxf=CABS1(x,0); + i++; + } + + while (i < n) { + if (ABS(x[i*2]) > maxf) { + maxf = ABS(x[i*2]); + } + i++; + } + return (maxf); + + } else { + + inc_x2 = 2 * inc_x; + maxf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) > maxf) { + maxf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) > maxf) { + maxf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) > maxf) { + maxf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c new file mode 100644 index 000000000..b15774bb9 --- /dev/null +++ b/kernel/zarch/zamin.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + minf = zamin_kernel_16(n1, x); + + i = n1; + } + else + { + minf=CABS1(x,0); + i++; + } + + while (i < n) { + if (ABS(x[i*2]) < minf) { + minf = ABS(x[i*2]); + } + i++; + } + return (minf); + + } else { + + inc_x2 = 2 * inc_x; + minf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) < minf) { + minf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) < minf) { + minf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) < minf) { + minf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 0fc5c9ecb..8faaf20eb 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -25,92 +25,98 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif - -static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { - +static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) +{ FLOAT asum; + __asm__ ( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v22 \n\t" - "vzero %%v23 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v23,%%v23,%%v26 \n\t" - "vfadb %%v22,%%v22,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v23,%%v23,%%v30 \n\t" - "vfadb %%v22,%%v22,%%v31 \n\t" - - "vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v23,%%v23,%%v26 \n\t" - "vfadb %%v22,%%v22,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v23,%%v23,%%v30 \n\t" - "vfadb %%v22,%%v22,%%v31 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - "vfadb %%v24,%%v0,%%v1 \n\t" - "vfadb %%v25,%%v23,%%v22 \n\t" - "vfadb %%v0,%%v25,%%v24 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %[asum] ,%%f0" - : [asum] "=f"(asum),[ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x) - : "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v2 \n\t" + "vfadb %%v0,%%v0,%%v3 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + return asum; - } - - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; @@ -128,7 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( n1 > 0 ) { - sumf=zasum_kernel_16(n1, x ); + sumf = zasum_kernel_16(n1, x); i=n1; ip=2*n1; } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 212de25c8..6ba44a27c 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -23,142 +23,98 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" - -static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { - - BLASLONG tempR1 ; - __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" - "pfd 2, 0(%[y_tmp]) \n\t" +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( #if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} + "vlrepg %%v0,0(%3) \n\t" + "vleg %%v1,8(%3),0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,8(%3),1 \n\t" +#else + "vleg %%v0,0(%3),1 \n\t" + "vflcdb %%v0,%%v0 \n\t" + "vleg %%v0,0(%3),0 \n\t" + "vlrepg %%v1,8(%3) \n\t" +#endif + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" -#else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vl %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 32(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 48(%[t1],%[y_tmp]) \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vl %%v30, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v31, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 96(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 1, 256(%[t1],%[x_tmp]) \n\t" - "pfd 2, 256(%[t1],%[y_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - "vfmadb %%v30, %%v20, %%v28, %%v30 \n\t" - "vfmadb %%v31, %%v21, %%v28, %%v31 \n\t" - "vfmadb %%v6, %%v22, %%v28, %%v6 \n\t" - "vfmadb %%v7, %%v23, %%v28, %%v7 \n\t" - "vl %%v16, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v17, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v18, 96(%[t1],%[y_tmp]) \n\t" - "vl %%v19, 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" - "vst %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v6 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,80(%%r1,%1) \n\t" + "vl %%v18,96(%%r1,%1) \n\t" + "vl %%v19,112(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "vst %%v28,64(%%r1,%2) \n\t" + "vst %%v29,80(%%r1,%2) \n\t" + "vst %%v30,96(%%r1,%2) \n\t" + "vst %%v31,112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; + FLOAT da[2]; if (n <= 0) return (0); @@ -166,8 +122,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; - if (n1) { - zaxpy_kernel_8(n1, x, y, da_r,da_i); + if (n1) { + da[0] = da_r; + da[1] = da_i; + zaxpy_kernel_8(n1, x, y, da); ix = 2 * n1; } i = n1; diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index b5bf383f7..8c940bba3 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -24,71 +24,28 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - + #include "common.h" - -static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n) - : [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,4 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","r2" + ); } - int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; @@ -137,9 +94,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } } - return(0); - + return(0); } - - diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 61c5d6b98..aab18e2e9 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -23,137 +23,92 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" -#if defined(Z13) - -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ __asm__ volatile( - "pfd 1, 0(%[ptr_x_tmp]) \n\t" - "pfd 1, 0(%[ptr_y_tmp]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %[n_tmp],%[n_tmp],3 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" - "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "vzero %%v28 \n\t" + "vzero %%v29 \n\t" + "vzero %%v30 \n\t" + "vzero %%v31 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" + "vl %%v16, 64(%%r1,%1) \n\t" + "vl %%v17, 80(%%r1,%1) \n\t" + "vl %%v18, 96(%%r1,%1) \n\t" + "vl %%v19, 112(%%r1,%1) \n\t" + "vl %%v0, 64(%%r1,%2) \n\t" + "vl %%v1, 80(%%r1,%2) \n\t" + "vl %%v2, 96(%%r1,%2) \n\t" + "vl %%v3, 112(%%r1,%2) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" - "vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - - - "la %%r1,128(%%r1) \n\t" - "brctg %[n_tmp],1b \n\t" - "vfadb %%v24,%%v26,%%v24 \n\t" - "vfadb %%v25,%%v25,%%v27 \n\t" - "vsteg %%v24, 0(%[ptr_d]),0 \n\t" - "vsteg %%v24, 8(%[ptr_d]),1 \n\t" - "vsteg %%v25,16(%[ptr_d]),1 \n\t" - "vsteg %%v25,24(%[ptr_d]),0 \n\t" - : [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n) - : [mem_x] "m"( *(const double (*)[2*n])x), - [mem_y] "m"( *(const double (*)[2*n])y), - [ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d) - : "cc", "r1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v24,%%v24,%%v26 \n\t" + "vfadb %%v24,%%v24,%%v28 \n\t" + "vfadb %%v24,%%v24,%%v30 \n\t" + "vfadb %%v25,%%v25,%%v27 \n\t" + "vfadb %%v25,%%v25,%%v29 \n\t" + "vfadb %%v25,%%v25,%%v31 \n\t" + "vsteg %%v24,0(%3),0 \n\t" + "vsteg %%v24,8(%3),1 \n\t" + "vsteg %%v25,16(%3),1 \n\t" + "vsteg %%v25,24(%3),0 " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else - -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { - BLASLONG register i = 0; - FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0}; - BLASLONG j = 0; - - while (i < n) { - - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; - - dot[0] += x[j + 2] * y[j + 2]; - dot[1] += x[j + 3] * y[j + 3]; - dot[2] += x[j + 2] * y[j + 3]; - dot[3] += x[j + 3] * y[j + 2]; - - dot[0] += x[j + 4] * y[j + 4]; - dot[1] += x[j + 5] * y[j + 5]; - dot[2] += x[j + 4] * y[j + 5]; - dot[3] += x[j + 5] * y[j + 4]; - - dot[0] += x[j + 6] * y[j + 6]; - dot[1] += x[j + 7] * y[j + 7]; - dot[2] += x[j + 6] * y[j + 7]; - dot[3] += x[j + 7] * y[j + 6]; - - j += 8; - i += 4; - - } - d[0] = dot[0]; - d[1] = dot[1]; - d[2] = dot[2]; - d[3] = dot[3]; - -} - -#endif - OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix=0, iy=0; + BLASLONG i; + BLASLONG ix, iy; OPENBLAS_COMPLEX_FLOAT result; FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; @@ -167,14 +122,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { BLASLONG n1 = n & -8; - BLASLONG j=0; - if (n1){ + if (n1) zdot_kernel_8(n1, x, y, dot); - i = n1; - j = n1 <<1; - } - + + i = n1; + BLASLONG j = i * 2; while (i < n) { diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 380f0140e..75027a06c 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27,112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19,112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "clgrjl %%r1,%[tmp],1b \n\t" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc","r1" ,"v0","v1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ ( + "vlrepg %%v0,%3 \n\t" + "vlrepg %%v1,%4 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -214,8 +204,11 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -16; if ( n1 > 0 ) - { - zrot_kernel_16(n1, x, y, c, s); + { + FLOAT cosa,sina; + cosa=c; + sina=s; + zrot_kernel_16(n1, x, y, &cosa, &sina); i=n1; ix=2*n1; } @@ -234,6 +227,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + } else { @@ -259,3 +253,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 4764c0a52..4d8ee960f 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -23,270 +23,211 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ #include "common.h" +static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepg %%v0,0(%1) \n\t" + "vleg %%v1,8(%1),0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,8(%1),1 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + "vpdi %%v28,%%v20,%%v20,4 \n\t" + "vpdi %%v29,%%v21,%%v21,4 \n\t" + "vpdi %%v30,%%v22,%%v22,4 \n\t" + "vpdi %%v31,%%v23,%%v23,4 \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17 \n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18 \n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19 \n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20 \n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21 \n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22 \n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} +static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vleg %%v0,8(%1),0 \n\t" + "wflcdb %%v0,%%v0 \n\t" + "vleg %%v0,8(%1),1 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" -static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { - BLASLONG tempR1 ; - __asm__ ( - "pfd 2, 0(%[x_tmp]) \n\t" -#if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v16,%%v16,%%v16,4 \n\t" + "vpdi %%v17,%%v17,%%v17,4 \n\t" + "vpdi %%v18,%%v18,%%v18,4 \n\t" + "vpdi %%v19,%%v19,%%v19,4 \n\t" + "vpdi %%v20,%%v20,%%v20,4 \n\t" + "vpdi %%v21,%%v21,%%v21,4 \n\t" + "vpdi %%v22,%%v22,%%v22,4 \n\t" + "vpdi %%v23,%%v23,%%v23,4 \n\t" -#else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmdb %%v16, %%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vl %%v20, 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21, 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22, 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23, 112(%[t1],%[x_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" - "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepg %%v0,0(%1) \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 2, 256(%[t1],%[x_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - - "vfmdb %%v30, %%v20, %%v28 \n\t" - "vfmdb %%v31, %%v21, %%v28 \n\t" - "vfmdb %%v6, %%v22, %%v28 \n\t" - "vfmdb %%v7, %%v23, %%v28 \n\t" - - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" - - - "vst %%v30 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v6 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[x_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - - - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmdb %%v16, %%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" - - : [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - - -} - -static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%1) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint - "vflcdb %%v16,%%v16 \n\t" //complement both - "vlvgg %%v16,%%r0,0 \n\t" //restore 1st - "vlr %%v17 ,%%v16 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v16 \n\t" - "vsteg %%v24, 0(%[x_ptr]),1 \n\t" - "vsteg %%v24, 8(%[x_ptr]),0 \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v17 \n\t" - "vsteg %%v25, 16(%[x_ptr]),1 \n\t" - "vsteg %%v25, 24(%[x_ptr]),0 \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vsteg %%v26, 32(%[x_ptr]),1 \n\t" - "vsteg %%v26, 40(%[x_ptr]),0 \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vsteg %%v27, 48(%[x_ptr]),1 \n\t" - "vsteg %%v27, 56(%[x_ptr]),0 \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v16 \n\t" - "vsteg %%v28, 64(%[x_ptr]),1 \n\t" - "vsteg %%v28, 72(%[x_ptr]),0 \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v17 \n\t" - "vsteg %%v29, 80(%[x_ptr]),1 \n\t" - "vsteg %%v29, 88(%[x_ptr]),0 \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb %%v30,%%v30,%%v16 \n\t" - "vsteg %%v30, 96(%[x_ptr]),1 \n\t" - "vsteg %%v30, 104(%[x_ptr]),0 \n\t" - "vl %%v31, 112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vsteg %%v31, 112(%[x_ptr]),1 \n\t" - "vsteg %%v31, 120(%[x_ptr]),0 \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_i) - :"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - - + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); } -static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) { - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v18,%%r0,%%r0 \n\t" - "vlr %%v19,%%v18 \n\t" - "vlr %%v16,%%v18 \n\t" - "vlr %%v17,%%v18 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v18 \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v19 \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vst %%v26, 32(%[x_ptr]) \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vst %%v27, 48(%[x_ptr]) \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v18 \n\t" - "vst %%v28, 64(%[x_ptr]) \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v19 \n\t" - "vst %%v29, 80(%[x_ptr]) \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb %%v30,%%v30,%%v16 \n\t" - "vst %%v30, 96(%[x_ptr]) \n\t" - "vl %%v31,112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vst %%v31,112(%[x_ptr]) \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_r) - : "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" - ); - -} - -static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256( %[x_ptr]) \n\t" - "vst %%v24, 0( %[x_ptr]) \n\t" - "vst %%v25, 16( %[x_ptr]) \n\t" - "vst %%v26, 32( %[x_ptr]) \n\t" - "vst %%v27, 48( %[x_ptr]) \n\t" - "vst %%v24, 64( %[x_ptr]) \n\t" - "vst %%v25, 80( %[x_ptr]) \n\t" - "vst %%v26, 96( %[x_ptr]) \n\t" - "vst %%v27,112( %[x_ptr]) \n\t" - - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" ,"r0","v24","v25","v26","v27" - ); - -} - - - - - -static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) { - +static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +{ BLASLONG i; BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; - for (i = 0; i < n; i += 4) { + for (i = 0; i < n; i += 4) + { t0 = da_r * x[0] - da_i * x[1]; t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; @@ -303,17 +244,14 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLAS x[inc_x3] = t3; x += 4 * inc_x; - } - - } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; FLOAT temp0; FLOAT temp1; - + FLOAT alpha[2] __attribute__ ((aligned(16))); if (inc_x != 1) { inc_x <<= 1; @@ -405,8 +343,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { BLASLONG n1 = n & -8; - if (n1 > 0) { - zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x); + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); j = n1; i = n1 * inc_x; } @@ -432,17 +372,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; if (da_r == 0.0) if (da_i == 0) zscal_kernel_8_zero(n1, x); else - zscal_kernel_8_zero_r(n1, da_i, x); + zscal_kernel_8_zero_r(n1, alpha, x); else if (da_i == 0) - zscal_kernel_8_zero_i(n1, da_r, x); + zscal_kernel_8_zero_i(n1, alpha, x); else - zscal_kernel_8(n1, da_r,da_i, x); + zscal_kernel_8(n1, alpha, x); i = n1 << 1; j = n1; @@ -508,5 +450,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, return (0); } - - diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 062079002..a16b87cdc 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -25,220 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" - -#if defined(Z13_SWAP_A) -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" + __asm__ volatile( + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else - -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} - -#endif - - - - - - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; diff --git a/ztest/Makefile b/ztest/Makefile new file mode 100644 index 000000000..0ff7fe46a --- /dev/null +++ b/ztest/Makefile @@ -0,0 +1,437 @@ +TOPDIR = .. +include $(TOPDIR)/Makefile.system + +goto :: sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto sswap.goto dswap.goto cswap.goto zswap.goto isamax.goto idamax.goto icamax.goto izamax.goto samax.goto damax.goto ismax.goto idmax.goto smax.goto dmax.goto isamin.goto idamin.goto icamin.goto izamin.goto samin.goto damin.goto camin.goto zamin.goto ismin.goto idmin.goto smin.goto dmin.goto sgemv.goto dgemv.goto cgemv.goto zgemv.goto sscal.goto dscal.goto cscal.goto zscal.goto saxpy.goto daxpy.goto caxpy.goto zaxpy.goto srot.goto drot.goto crot.goto zrot.goto sasum.goto dasum.goto casum.goto zasum.goto scopy.goto dcopy.goto ccopy.goto zcopy.goto + +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Ddot #################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zdot #################################################### +zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dsdot #################################################### +dsdot.goto : dsdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMAX ############################################## +samax.goto : samax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMAX ############################################## +damax.goto : damax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMAX ############################################## +ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMAX ############################################## +idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMAX ############################################## +smax.goto : smax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMAX ############################################## +dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMIN ############################################## +isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMIN ############################################## +idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMIN ############################################## +icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMIN ############################################## +izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMIN ############################################## +samin.goto : samin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMIN ############################################## +damin.goto : damin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMIN ############################################## +camin.goto : camin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMIN ############################################## +zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMIN ############################################## +ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMIN ############################################## +idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMIN ############################################## +smin.goto : smin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMIN ############################################## +dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cgemv #################################################### + +cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Crot #################################################### +crot.goto : crot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zrot #################################################### +zrot.goto : zrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zswap #################################################### + +zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sasum #################################################### +sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dasum #################################################### +dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Casum #################################################### + +casum.goto : casum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zasum #################################################### + +zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Scopy #################################################### +scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dcopy #################################################### +dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Ccopy #################################################### + +ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zcopy #################################################### + +zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +################################################################################################### + +sdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +ddot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +dsdot.$(SUFFIX) : dsdot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +isamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +samax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ismax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +smax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +isamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +samin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ismin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +smin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +crot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +clean :: + @rm -f *.goto + diff --git a/ztest/amax.c b/ztest/amax.c new file mode 100644 index 000000000..f2e3f5411 --- /dev/null +++ b/ztest/amax.c @@ -0,0 +1,235 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + +#undef AMAX +#ifdef DOUBLE +#define AMAX BLASFUNC(damax) +#else +#define AMAX BLASFUNC(samax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + +#undef AMIN +#ifdef DOUBLE +#define AMIN BLASFUNC(damin) +#else +#define AMIN BLASFUNC(samin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} +#else +FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} +#endif + +#undef ASUM +#ifdef COMPLEX +#ifdef DOUBLE +#define ASUM BLASFUNC(dzasum) +#else +#define ASUM BLASFUNC(scasum) +#endif +#else +#ifdef DOUBLE +#define ASUM BLASFUNC(dasum) +#else +#define ASUM BLASFUNC(sasum) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} +#else +int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} +#endif + +#undef AXPY +#ifdef COMPLEX +#ifdef DOUBLE +#define AXPY BLASFUNC(zaxpy) +#else +#define AXPY BLASFUNC(caxpy) +#endif +#else +#ifdef DOUBLE +#define AXPY BLASFUNC(daxpy) +#else +#define AXPY BLASFUNC(saxpy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *y_c;; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + argc--;argv++; + + blasint iy; + int test = 1; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} +#else +int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} +#endif + +#undef COPY +#ifdef COMPLEX +#ifdef DOUBLE +#define COPY BLASFUNC(zcopy) +#else +#define COPY BLASFUNC(ccopy) +#endif +#else +#ifdef DOUBLE +#define COPY BLASFUNC(dcopy) +#else +#define COPY BLASFUNC(scopy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *y_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot[2]; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; + + dot[0]=0.0; + dot[1]=0.0; + + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { +#if !defined(CONJ) + dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; + dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; +#else + dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; + dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; + return(result); + +} +#else +FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); +} +#endif + +#undef DOT +#ifdef COMPLEX +#ifdef DOUBLE +#define DOT BLASFUNC(zdotu) +#else +#define DOT BLASFUNC(cdotu) +#endif +#else +#ifdef DOUBLE +#define DOT BLASFUNC(ddot) +#else +#define DOT BLASFUNC(sdot) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; +#ifdef COMPLEX + OPENBLAS_COMPLEX_FLOAT result, result_c; +#else + FLOAT result, result_c; +#endif + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); +} + +#undef DSDOT +#define DSDOT BLASFUNC(dsdot) + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + double result, result_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + ix = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; jtv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y, *y_c; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char trans='N'; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + blasint n=0; + int has_param_n = 0; + int has_param_m = 0; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + + int tomax = to; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_PARAM_N"))) { + n = atoi(p); + if ((n>0)) has_param_n = 1; + if ( n > tomax ) tomax = n; + } + if ( has_param_n == 0 ) + if ((p = getenv("OPENBLAS_PARAM_M"))) { + m = atoi(p); + if ((m>0)) has_param_m = 1; + if ( m > tomax ) tomax = m; + } + + + + fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + if (has_param_m == 0) + { + + for(m = from; m <= to; m += step) + { + timeg=0; + timeg_c=0; + if ( has_param_n == 0 ) n = m; + fprintf(stderr, " %6dx%d :", (int)m,(int)n); + for(j = 0; j < m; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} +#else +BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} +#endif + +#undef IAMAX +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMAX BLASFUNC(izamax) +#else +#define IAMAX BLASFUNC(icamax) +#endif +#else +#ifdef DOUBLE +#define IAMAX BLASFUNC(idamax) +#else +#define IAMAX BLASFUNC(isamax) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} +#else +BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} +#endif + +#undef IAMIN +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMIN BLASFUNC(izamin) +#else +#define IAMIN BLASFUNC(icamin) +#endif +#else +#ifdef DOUBLE +#define IAMIN BLASFUNC(idamin) +#else +#define IAMIN BLASFUNC(isamin) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + +#undef IMAX +#ifdef DOUBLE +#define IMAX BLASFUNC(idmax) +#else +#define IMAX BLASFUNC(ismax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + +#undef IMIN +#ifdef DOUBLE +#define IMIN BLASFUNC(idmin) +#else +#define IMIN BLASFUNC(ismin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + +#undef MAX_ +#ifdef DOUBLE +#define MAX_ BLASFUNC(dmax) +#else +#define MAX_ BLASFUNC(smax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + +#undef MIN_ +#ifdef DOUBLE +#define MIN_ BLASFUNC(dmin) +#else +#define MIN_ BLASFUNC(smin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); +} +#else +int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); +} +#endif + +#undef ROT +#ifdef COMPLEX +#ifdef DOUBLE +#define ROT BLASFUNC(zdrot) +#else +#define ROT BLASFUNC(csrot) +#endif +#else +#ifdef DOUBLE +#define ROT BLASFUNC(drot) +#else +#define ROT BLASFUNC(srot) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *x_c, *y_c; + // FLOAT result; + blasint m, i; + blasint inc_x=1,inc_y=1; + FLOAT c[1] = { 2.0 }; + FLOAT s[1] = { 2.0 }; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix,iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; + + if ( (n <= 0) || (inc_x <= 0)) + return(0); + + inc_x2 = 2 * inc_x; + for ( i=0; itv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *x_c; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); +} +#else +int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); +} +#endif + +#undef SWAP +#ifdef COMPLEX +#ifdef DOUBLE +#define SWAP BLASFUNC(zswap) +#else +#define SWAP BLASFUNC(cswap) +#endif +#else +#ifdef DOUBLE +#define SWAP BLASFUNC(dswap) +#else +#define SWAP BLASFUNC(sswap) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *x_c, *y_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix,iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l