From 23229011db2ab03c7643f1e0a007efc8e0276201 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 6 Aug 2018 18:20:40 +0300 Subject: [PATCH 01/26] [ZARCH] Z14 support, BLAS 1/2 single precision implementations, Some missing double precision implementations, Gemv optimization --- Makefile.zarch | 4 + cpuid_zarch.c | 35 +- kernel/zarch/KERNEL.Z13 | 20 +- kernel/zarch/KERNEL.Z14 | 146 +++++++ kernel/zarch/camax.c | 269 +++++++++++++ kernel/zarch/camin.c | 269 +++++++++++++ kernel/zarch/casum.c | 167 ++++++++ kernel/zarch/caxpy.c | 174 +++++++++ kernel/zarch/ccopy.c | 99 +++++ kernel/zarch/cdot.c | 182 +++++++++ kernel/zarch/crot.c | 256 ++++++++++++ kernel/zarch/cscal.c | 456 +++++++++++++++++++++ kernel/zarch/cswap.c | 183 +++++++++ kernel/zarch/damax.c | 206 ++++++++++ kernel/zarch/damin.c | 206 ++++++++++ kernel/zarch/dasum.c | 152 +++---- kernel/zarch/daxpy.c | 175 ++++----- kernel/zarch/dcopy.c | 122 +----- kernel/zarch/ddot.c | 151 +++---- kernel/zarch/dgemv_n_4.c | 490 ++++++++++++++++------- kernel/zarch/dgemv_t_4.c | 574 ++++++++++++++++++++------- kernel/zarch/dmax.c | 182 +++++++++ kernel/zarch/dmin.c | 182 +++++++++ kernel/zarch/drot.c | 338 ++++++++-------- kernel/zarch/dscal.c | 198 ++++------ kernel/zarch/dsdot.c | 180 +++++++++ kernel/zarch/dswap.c | 286 ++++---------- kernel/zarch/icamax.c | 319 +++++++++++++++ kernel/zarch/icamin.c | 319 +++++++++++++++ kernel/zarch/idamax.c | 289 +++++++------- kernel/zarch/idamin.c | 299 +++++++------- kernel/zarch/idmax.c | 232 +++++++++++ kernel/zarch/idmin.c | 232 +++++++++++ kernel/zarch/isamax.c | 299 ++++++++++++++ kernel/zarch/isamin.c | 299 ++++++++++++++ kernel/zarch/ismax.c | 275 +++++++++++++ kernel/zarch/ismin.c | 275 +++++++++++++ kernel/zarch/izamax.c | 322 +++++++-------- kernel/zarch/izamin.c | 392 +++++++++---------- kernel/zarch/samax.c | 210 ++++++++++ kernel/zarch/samin.c | 210 ++++++++++ kernel/zarch/sasum.c | 174 +++++++++ kernel/zarch/saxpy.c | 184 +++++++++ kernel/zarch/scopy.c | 85 ++++ kernel/zarch/sdot.c | 140 +++++++ kernel/zarch/sgemv_n_4.c | 668 +++++++++++++++++++++++++++++++ kernel/zarch/sgemv_t_4.c | 826 +++++++++++++++++++++++++++++++++++++++ kernel/zarch/smax.c | 186 +++++++++ kernel/zarch/smin.c | 186 +++++++++ kernel/zarch/srot.c | 246 ++++++++++++ kernel/zarch/sscal.c | 201 ++++++++++ kernel/zarch/sswap.c | 164 ++++++++ kernel/zarch/zamax.c | 221 +++++++++++ kernel/zarch/zamin.c | 221 +++++++++++ kernel/zarch/zasum.c | 152 +++---- kernel/zarch/zaxpy.c | 198 ++++------ kernel/zarch/zcopy.c | 86 +--- kernel/zarch/zdot.c | 203 ++++------ kernel/zarch/zrot.c | 339 ++++++++-------- kernel/zarch/zscal.c | 460 ++++++++++------------ kernel/zarch/zswap.c | 285 ++++---------- ztest/Makefile | 437 +++++++++++++++++++++ ztest/amax.c | 235 +++++++++++ ztest/amin.c | 235 +++++++++++ ztest/asum.c | 263 +++++++++++++ ztest/axpy.c | 303 ++++++++++++++ ztest/copy.c | 291 ++++++++++++++ ztest/dot.c | 296 ++++++++++++++ ztest/dsdot.c | 229 +++++++++++ ztest/gemv.c | 618 +++++++++++++++++++++++++++++ ztest/iamax.c | 284 ++++++++++++++ ztest/iamin.c | 284 ++++++++++++++ ztest/imax.c | 231 +++++++++++ ztest/imin.c | 231 +++++++++++ ztest/max.c | 229 +++++++++++ ztest/min.c | 229 +++++++++++ ztest/rot.c | 303 ++++++++++++++ ztest/scal.c | 308 +++++++++++++++ ztest/swap.c | 306 +++++++++++++++ 79 files changed, 17314 insertions(+), 2897 deletions(-) create mode 100644 kernel/zarch/KERNEL.Z14 create mode 100644 kernel/zarch/camax.c create mode 100644 kernel/zarch/camin.c create mode 100644 kernel/zarch/casum.c create mode 100644 kernel/zarch/caxpy.c create mode 100644 kernel/zarch/ccopy.c create mode 100644 kernel/zarch/cdot.c create mode 100644 kernel/zarch/crot.c create mode 100644 kernel/zarch/cscal.c create mode 100644 kernel/zarch/cswap.c create mode 100644 kernel/zarch/damax.c create mode 100644 kernel/zarch/damin.c create mode 100644 kernel/zarch/dmax.c create mode 100644 kernel/zarch/dmin.c create mode 100644 kernel/zarch/dsdot.c create mode 100644 kernel/zarch/icamax.c create mode 100644 kernel/zarch/icamin.c create mode 100644 kernel/zarch/idmax.c create mode 100644 kernel/zarch/idmin.c create mode 100644 kernel/zarch/isamax.c create mode 100644 kernel/zarch/isamin.c create mode 100644 kernel/zarch/ismax.c create mode 100644 kernel/zarch/ismin.c create mode 100644 kernel/zarch/samax.c create mode 100644 kernel/zarch/samin.c create mode 100644 kernel/zarch/sasum.c create mode 100644 kernel/zarch/saxpy.c create mode 100644 kernel/zarch/scopy.c create mode 100644 kernel/zarch/sdot.c create mode 100644 kernel/zarch/sgemv_n_4.c create mode 100644 kernel/zarch/sgemv_t_4.c create mode 100644 kernel/zarch/smax.c create mode 100644 kernel/zarch/smin.c create mode 100644 kernel/zarch/srot.c create mode 100644 kernel/zarch/sscal.c create mode 100644 kernel/zarch/sswap.c create mode 100644 kernel/zarch/zamax.c create mode 100644 kernel/zarch/zamin.c create mode 100644 ztest/Makefile create mode 100644 ztest/amax.c create mode 100644 ztest/amin.c create mode 100644 ztest/asum.c create mode 100644 ztest/axpy.c create mode 100644 ztest/copy.c create mode 100644 ztest/dot.c create mode 100644 ztest/dsdot.c create mode 100644 ztest/gemv.c create mode 100644 ztest/iamax.c create mode 100644 ztest/iamin.c create mode 100644 ztest/imax.c create mode 100644 ztest/imin.c create mode 100644 ztest/max.c create mode 100644 ztest/min.c create mode 100644 ztest/rot.c create mode 100644 ztest/scal.c create mode 100644 ztest/swap.c diff --git a/Makefile.zarch b/Makefile.zarch index 9ec9dc79f..47ea1eb71 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector FCOMMON_OPT += -march=z13 -mzvector endif +ifeq ($(CORE), Z14) +CCOMMON_OPT += -march=z14 -mzvector +FCOMMON_OPT += -march=z14 -mzvector +endif diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 4e1935429..0ae32f27d 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -29,40 +29,25 @@ #define CPU_GENERIC 0 #define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", - "Z13" + "Z13", + "Z14" }; static char *cpuname_lower[] = { "zarch_generic", - "z13" + "z13", + "z14" }; int detect(void) { - FILE *infile; - char buffer[512], *p; - - p = (char *)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Type", buffer, 4)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if (strstr(p, "2964")) return CPU_Z13; - if (strstr(p, "2965")) return CPU_Z13; - - return CPU_GENERIC; + // return CPU_GENERIC; + return CPU_Z14; + } void get_libname(void) @@ -107,5 +92,9 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; + case CPU_Z14: + printf("#define Z14\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + break; } } diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index add628bfe..d39b9d904 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -1,18 +1,18 @@ SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = damax.c CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = zamax.c SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c +DAMINKERNEL = damin.c CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = zamin.c SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c +DMAXKERNEL = dmax.c SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c +DMINKERNEL = dmin.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = idamax.c @@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = izamin.c ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = idmax.c ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c +IDMINKERNEL = idmin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = dasum.c @@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n_4.c CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = zgemv_t_4.c +ZGEMVTKERNEL = ../arm/zgemv_t.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 new file mode 100644 index 000000000..fa88b6881 --- /dev/null +++ b/kernel/zarch/KERNEL.Z14 @@ -0,0 +1,146 @@ +SAMAXKERNEL = samax.c +DAMAXKERNEL = damax.c +CAMAXKERNEL = camax.c +ZAMAXKERNEL = zamax.c + +SAMINKERNEL = samin.c +DAMINKERNEL = damin.c +CAMINKERNEL = camin.c +ZAMINKERNEL = zamin.c + +SMAXKERNEL = smax.c +DMAXKERNEL = dmax.c + +SMINKERNEL = smin.c +DMINKERNEL = dmin.c + +ISAMAXKERNEL = isamax.c +IDAMAXKERNEL = idamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c + +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c +IZAMINKERNEL = izamin.c + +ISMAXKERNEL = ismax.c +IDMAXKERNEL = idmax.c + +ISMINKERNEL = ismin.c +IDMINKERNEL = idmin.c + +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c + +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c + +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c +DSDOTKERNEL = dsdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c + +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c + +SGEMVNKERNEL = sgemv_n_4.c +DGEMVNKERNEL = dgemv_n_4.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = sgemv_t_4.c +DGEMVTKERNEL = dgemv_t_4.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = strmm8x4V.S +DTRMMKERNEL = trmm8x4V.S +CTRMMKERNEL = ctrmm4x4V.S +ZTRMMKERNEL = ztrmm4x4V.S + +SGEMMKERNEL = strmm8x4V.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + + + +DGEMMKERNEL = gemm8x4V.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ctrmm4x4V.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ztrmm4x4V.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + + diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c new file mode 100644 index 000000000..6394be769 --- /dev/null +++ b/kernel/zarch/camax.c @@ -0,0 +1,269 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vlef %%v0,0(%2),0 \n\t" + "vlef %%v16,4(%2),0 \n\t" + "vlef %%v0,8(%2),1 \n\t" + "vlef %%v16,12(%2),1 \n\t" + "vlef %%v0,16(%2),2 \n\t" + "vlef %%v16,20(%2),2 \n\t" + "vlef %%v0,24(%2),3 \n\t" + "vlef %%v16,28(%2),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v16,%%v16 \n\t" + "vfasb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%2) \n\t" + + "vlef %%v16,0(%%r1,%2),0 \n\t" + "vlef %%v17,4(%%r1,%2),0 \n\t" + "vlef %%v16,8(%%r1,%2),1 \n\t" + "vlef %%v17,12(%%r1,%2),1 \n\t" + "vlef %%v16,16(%%r1,%2),2 \n\t" + "vlef %%v17,20(%%r1,%2),2 \n\t" + "vlef %%v16,24(%%r1,%2),3 \n\t" + "vlef %%v17,28(%%r1,%2),3 \n\t" + + "vlef %%v18,32(%%r1,%2),0 \n\t" + "vlef %%v19,36(%%r1,%2),0 \n\t" + "vlef %%v18,40(%%r1,%2),1 \n\t" + "vlef %%v19,44(%%r1,%2),1 \n\t" + "vlef %%v18,48(%%r1,%2),2 \n\t" + "vlef %%v19,52(%%r1,%2),2 \n\t" + "vlef %%v18,56(%%r1,%2),3 \n\t" + "vlef %%v19,30(%%r1,%2),3 \n\t" + + "vlef %%v20,64(%%r1,%2),0 \n\t" + "vlef %%v21,68(%%r1,%2),0 \n\t" + "vlef %%v20,72(%%r1,%2),1 \n\t" + "vlef %%v21,76(%%r1,%2),1 \n\t" + "vlef %%v20,80(%%r1,%2),2 \n\t" + "vlef %%v21,84(%%r1,%2),2 \n\t" + "vlef %%v20,88(%%r1,%2),3 \n\t" + "vlef %%v21,92(%%r1,%2),3 \n\t" + + "vlef %%v22,96(%%r1,%2),0 \n\t" + "vlef %%v23,100(%%r1,%2),0 \n\t" + "vlef %%v22,104(%%r1,%2),1 \n\t" + "vlef %%v23,108(%%r1,%2),1 \n\t" + "vlef %%v22,112(%%r1,%2),2 \n\t" + "vlef %%v23,116(%%r1,%2),2 \n\t" + "vlef %%v22,120(%%r1,%2),3 \n\t" + "vlef %%v23,124(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vlef %%v16,128(%%r1,%2),0 \n\t" + "vlef %%v17,132(%%r1,%2),0 \n\t" + "vlef %%v16,136(%%r1,%2),1 \n\t" + "vlef %%v17,140(%%r1,%2),1 \n\t" + "vlef %%v16,144(%%r1,%2),2 \n\t" + "vlef %%v17,148(%%r1,%2),2 \n\t" + "vlef %%v16,152(%%r1,%2),3 \n\t" + "vlef %%v17,156(%%r1,%2),3 \n\t" + + "vlef %%v18,160(%%r1,%2),0 \n\t" + "vlef %%v19,164(%%r1,%2),0 \n\t" + "vlef %%v18,168(%%r1,%2),1 \n\t" + "vlef %%v19,172(%%r1,%2),1 \n\t" + "vlef %%v18,176(%%r1,%2),2 \n\t" + "vlef %%v19,180(%%r1,%2),2 \n\t" + "vlef %%v18,184(%%r1,%2),3 \n\t" + "vlef %%v19,188(%%r1,%2),3 \n\t" + + "vlef %%v20,192(%%r1,%2),0 \n\t" + "vlef %%v21,196(%%r1,%2),0 \n\t" + "vlef %%v20,200(%%r1,%2),1 \n\t" + "vlef %%v21,204(%%r1,%2),1 \n\t" + "vlef %%v20,208(%%r1,%2),2 \n\t" + "vlef %%v21,212(%%r1,%2),2 \n\t" + "vlef %%v20,216(%%r1,%2),3 \n\t" + "vlef %%v21,220(%%r1,%2),3 \n\t" + + "vlef %%v22,224(%%r1,%2),0 \n\t" + "vlef %%v23,228(%%r1,%2),0 \n\t" + "vlef %%v22,232(%%r1,%2),1 \n\t" + "vlef %%v23,236(%%r1,%2),1 \n\t" + "vlef %%v22,240(%%r1,%2),2 \n\t" + "vlef %%v23,244(%%r1,%2),2 \n\t" + "vlef %%v22,248(%%r1,%2),3 \n\t" + "vlef %%v23,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = camax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=CABS1(x,0); + i++; + } + + while (i < n) { + if (ABS(x[i*2]) > maxf) { + maxf = ABS(x[i*2]); + } + i++; + } + return (maxf); + + } else { + + inc_x2 = 2 * inc_x; + maxf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) > maxf) { + maxf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) > maxf) { + maxf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) > maxf) { + maxf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c new file mode 100644 index 000000000..936c300c8 --- /dev/null +++ b/kernel/zarch/camin.c @@ -0,0 +1,269 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vlef %%v0,0(%2),0 \n\t" + "vlef %%v16,4(%2),0 \n\t" + "vlef %%v0,8(%2),0 \n\t" + "vlef %%v16,12(%2),0 \n\t" + "vlef %%v0,16(%2),2 \n\t" + "vlef %%v16,20(%2),2 \n\t" + "vlef %%v0,24(%2),3 \n\t" + "vlef %%v16,28(%2),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v16,%%v16 \n\t" + "vfasb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vlef %%v16,0(%%r1,%2),0 \n\t" + "vlef %%v17,4(%%r1,%2),0 \n\t" + "vlef %%v16,8(%%r1,%2),0 \n\t" + "vlef %%v17,12(%%r1,%2),0 \n\t" + "vlef %%v16,16(%%r1,%2),2 \n\t" + "vlef %%v17,20(%%r1,%2),2 \n\t" + "vlef %%v16,24(%%r1,%2),3 \n\t" + "vlef %%v17,28(%%r1,%2),3 \n\t" + + "vlef %%v18,32(%%r1,%2),0 \n\t" + "vlef %%v19,36(%%r1,%2),0 \n\t" + "vlef %%v18,40(%%r1,%2),0 \n\t" + "vlef %%v19,44(%%r1,%2),0 \n\t" + "vlef %%v18,48(%%r1,%2),2 \n\t" + "vlef %%v19,52(%%r1,%2),2 \n\t" + "vlef %%v18,56(%%r1,%2),3 \n\t" + "vlef %%v19,30(%%r1,%2),3 \n\t" + + "vlef %%v20,64(%%r1,%2),0 \n\t" + "vlef %%v21,68(%%r1,%2),0 \n\t" + "vlef %%v20,72(%%r1,%2),0 \n\t" + "vlef %%v21,76(%%r1,%2),0 \n\t" + "vlef %%v20,80(%%r1,%2),2 \n\t" + "vlef %%v21,84(%%r1,%2),2 \n\t" + "vlef %%v20,88(%%r1,%2),3 \n\t" + "vlef %%v21,92(%%r1,%2),3 \n\t" + + "vlef %%v22,96(%%r1,%2),0 \n\t" + "vlef %%v23,100(%%r1,%2),0 \n\t" + "vlef %%v22,104(%%r1,%2),0 \n\t" + "vlef %%v23,108(%%r1,%2),0 \n\t" + "vlef %%v22,112(%%r1,%2),2 \n\t" + "vlef %%v23,116(%%r1,%2),2 \n\t" + "vlef %%v22,120(%%r1,%2),3 \n\t" + "vlef %%v23,124(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vlef %%v16,128(%%r1,%2),0 \n\t" + "vlef %%v17,132(%%r1,%2),0 \n\t" + "vlef %%v16,136(%%r1,%2),0 \n\t" + "vlef %%v17,140(%%r1,%2),0 \n\t" + "vlef %%v16,144(%%r1,%2),2 \n\t" + "vlef %%v17,148(%%r1,%2),2 \n\t" + "vlef %%v16,152(%%r1,%2),3 \n\t" + "vlef %%v17,156(%%r1,%2),3 \n\t" + + "vlef %%v18,160(%%r1,%2),0 \n\t" + "vlef %%v19,164(%%r1,%2),0 \n\t" + "vlef %%v18,168(%%r1,%2),0 \n\t" + "vlef %%v19,172(%%r1,%2),0 \n\t" + "vlef %%v18,176(%%r1,%2),2 \n\t" + "vlef %%v19,180(%%r1,%2),2 \n\t" + "vlef %%v18,184(%%r1,%2),3 \n\t" + "vlef %%v19,188(%%r1,%2),3 \n\t" + + "vlef %%v20,192(%%r1,%2),0 \n\t" + "vlef %%v21,196(%%r1,%2),0 \n\t" + "vlef %%v20,200(%%r1,%2),0 \n\t" + "vlef %%v21,204(%%r1,%2),0 \n\t" + "vlef %%v20,208(%%r1,%2),2 \n\t" + "vlef %%v21,212(%%r1,%2),2 \n\t" + "vlef %%v20,216(%%r1,%2),3 \n\t" + "vlef %%v21,220(%%r1,%2),3 \n\t" + + "vlef %%v22,224(%%r1,%2),0 \n\t" + "vlef %%v23,228(%%r1,%2),0 \n\t" + "vlef %%v22,232(%%r1,%2),0 \n\t" + "vlef %%v23,236(%%r1,%2),0 \n\t" + "vlef %%v22,240(%%r1,%2),2 \n\t" + "vlef %%v23,244(%%r1,%2),2 \n\t" + "vlef %%v22,248(%%r1,%2),3 \n\t" + "vlef %%v23,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = camin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=CABS1(x,0); + i++; + } + + while (i < n) { + if (ABS(x[i*2]) < minf) { + minf = ABS(x[i*2]); + } + i++; + } + return (minf); + + } else { + + inc_x2 = 2 * inc_x; + minf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) < minf) { + minf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) < minf) { + minf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) < minf) { + minf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c new file mode 100644 index 000000000..f4ebc21bd --- /dev/null +++ b/kernel/zarch/casum.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT asum; + + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vfasb %%v0,%%v0,%%v3 \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "ler %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ip=0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -32; + if ( n1 > 0 ) + { + + sumf = casum_kernel_32(n1, x); + i=n1; + ip=2*n1; + } + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + i++; + ip+=2; + } + + } + else + { + inc_x2 = 2* inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip+=inc_x2; + i++; + } + + } + return(sumf); +} + + diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c new file mode 100644 index 000000000..2176f3dcd --- /dev/null +++ b/kernel/zarch/caxpy.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( +#if !defined(CONJ) + "vlrepf %%v0,0(%3) \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v1,4(%3),2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,4(%3),1 \n\t" + "vlef %%v1,4(%3),3 \n\t" +#else + "vlef %%v0,0(%3),1 \n\t" + "vlef %%v0,0(%3),3 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v0,0(%3),2 \n\t" + "vlrepf %%v1,4(%3) \n\t" +#endif + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + + "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,80(%%r1,%1) \n\t" + "vl %%v18,96(%%r1,%1) \n\t" + "vl %%v19,112(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + + "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,64(%%r1,%2) \n\t" + "vst %%v29,80(%%r1,%2) \n\t" + "vst %%v30,96(%%r1,%2) \n\t" + "vst %%v31,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2]; + + if (n <= 0) return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) { + da[0] = da_r; + da[1] = da_i; + caxpy_kernel_16(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { +#if !defined(CONJ) + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + i++; + ix += 2; + + } + return (0); + + + } + + inc_x *= 2; + inc_y *= 2; + + while (i < n) { + +#if !defined(CONJ) + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + ix += inc_x; + iy += inc_y; + i++; + + } + return (0); + +} + + diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c new file mode 100644 index 000000000..fc0b8d648 --- /dev/null +++ b/kernel/zarch/ccopy.c @@ -0,0 +1,99 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,5 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","r2" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + ccopy_kernel_32(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + + return(0); +} diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c new file mode 100644 index 000000000..3eda2979b --- /dev/null +++ b/kernel/zarch/cdot.c @@ -0,0 +1,182 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "vzero %%v28 \n\t" + "vzero %%v29 \n\t" + "vzero %%v30 \n\t" + "vzero %%v31 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + "verllg %%v22,%%v18,32 \n\t" + "verllg %%v23,%%v19,32 \n\t" + + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" + + "vl %%v16, 64(%%r1,%1) \n\t" + "vl %%v17, 80(%%r1,%1) \n\t" + "vl %%v18, 96(%%r1,%1) \n\t" + "vl %%v19, 112(%%r1,%1) \n\t" + "vl %%v0, 64(%%r1,%2) \n\t" + "vl %%v1, 80(%%r1,%2) \n\t" + "vl %%v2, 96(%%r1,%2) \n\t" + "vl %%v3, 112(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + "verllg %%v22,%%v18,32 \n\t" + "verllg %%v23,%%v19,32 \n\t" + + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v24,%%v24,%%v26 \n\t" + "vfasb %%v24,%%v24,%%v28 \n\t" + "vfasb %%v24,%%v24,%%v30 \n\t" + "vrepg %%v26,%%v24,1 \n\t" + "vfasb %%v24,%%v24,%%v26 \n\t" + "vfasb %%v25,%%v25,%%v27 \n\t" + "vfasb %%v25,%%v25,%%v29 \n\t" + "vfasb %%v25,%%v25,%%v31 \n\t" + "vrepg %%v27,%%v25,1 \n\t" + "vfasb %%v25,%%v25,%%v27 \n\t" + "vstef %%v24,0(%3),0 \n\t" + "vstef %%v24,4(%3),1 \n\t" + "vstef %%v25,8(%3),1 \n\t" + "vstef %%v25,12(%3),0 " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); + + } + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) + cdot_kernel_16(n1, x, y, dot); + + i = n1; + BLASLONG j = i * 2; + + while (i < n) { + + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; + + j += 2; + i++; + + } + + + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { + + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; + + ix += inc_x; + iy += inc_y; + i++; + + } + } + +#if !defined(CONJ) + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; +#else + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; + +#endif + + return (result); + +} + + diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c new file mode 100644 index 000000000..f04a624ac --- /dev/null +++ b/kernel/zarch/crot.c @@ -0,0 +1,256 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + __asm__ ( + "vlrepf %%v0,%3 \n\t" + "vlrepf %%v1,%4 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + crot_kernel_32(n1, x, y, &cosa, &sina); + i=n1; + ix=2*n1; + } + + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c new file mode 100644 index 000000000..0c15c5add --- /dev/null +++ b/kernel/zarch/cscal.c @@ -0,0 +1,456 @@ +/*************************************************************************** +Copyright (c) 2013 - 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepf %%v0,0(%1) \n\t" + "vlef %%v1,4(%1),0 \n\t" + "vlef %%v1,4(%1),2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,4(%1),1 \n\t" + "vlef %%v1,4(%1),3 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + "verllg %%v28,%%v20,32 \n\t" + "verllg %%v29,%%v21,32 \n\t" + "verllg %%v30,%%v22,32 \n\t" + "verllg %%v31,%%v23,32 \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16 \n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17 \n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18 \n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19 \n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20 \n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21 \n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22 \n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlef %%v0,4(%1),0 \n\t" + "vlef %%v0,4(%1),2 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,4(%1),1 \n\t" + "vlef %%v0,4(%1),3 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v16,%%v16,32 \n\t" + "verllg %%v17,%%v17,32 \n\t" + "verllg %%v18,%%v18,32 \n\t" + "verllg %%v19,%%v19,32 \n\t" + "verllg %%v20,%%v20,32 \n\t" + "verllg %%v21,%%v21,32 \n\t" + "verllg %%v22,%%v22,32 \n\t" + "verllg %%v23,%%v23,32 \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepf %%v0,0(%1) \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); +} + +static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) + { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); + + if (inc_x != 1) { + inc_x <<= 1; + + if (da_r == 0.0) { + + BLASLONG n1 = n & -2; + + if (da_i == 0.0) { + + while (j < n1) { + + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; + + } + + } else { + + while (j < n1) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + + + } + + } else { + + + if (da_i == 0.0) { + BLASLONG n1 = n & -2; + + while (j < n1) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } else { + + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + cscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } + + } + + return (0); + } + + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + alpha[0] = da_r; + alpha[1] = da_i; + + if (da_r == 0.0) + if (da_i == 0) + cscal_kernel_16_zero(n1, x); + else + cscal_kernel_16_zero_r(n1, alpha, x); + else + if (da_i == 0) + cscal_kernel_16_zero_i(n1, alpha, x); + else + cscal_kernel_16(n1, alpha, x); + + i = n1 << 1; + j = n1; + } + + + if (da_r == 0.0) { + + if (da_i == 0.0) { + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } else { + + if (da_i == 0.0) { + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } + + return (0); +} diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c new file mode 100644 index 000000000..256995d50 --- /dev/null +++ b/kernel/zarch/cswap.c @@ -0,0 +1,183 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + cswap_kernel_32(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c new file mode 100644 index 000000000..b74af5d37 --- /dev/null +++ b/kernel/zarch/damax.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = damax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c new file mode 100644 index 000000000..4cf5e88b1 --- /dev/null +++ b/kernel/zarch/damin.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = damin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 7a42a0863..fea431c34 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,8 +23,7 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" #include @@ -35,80 +34,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ABS fabsf #endif +static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT asum; - + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" -static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT asum ; - __asm__ ( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_temp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "la %[ptr_temp],256(%[ptr_temp]) \n\t" - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "clgrjl %[ptr_temp],%%r0,1b \n\t" - "vfadb %%v24,%%v0,%%v1 \n\t" - "vfadb %%v25,%%v2,%%v3 \n\t" - "vfadb %%v0,%%v25,%%v24 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %[asum],%%f0 \n\t" - : [asum] "=f"(asum),[ptr_temp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x) - : "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return asum; + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v2 \n\t" + "vfadb %%v0,%%v0,%%v3 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; } - - - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 16f82a587..e8823745e 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -25,98 +25,99 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" -#define PREFETCH_INS 1 -#if defined(Z13_A) -#include - -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) -{ - BLASLONG i = 0; - __vector double v_a = {alpha,alpha}; - __vector double * v_y=(__vector double *)y; - __vector double * v_x=(__vector double *)x; - - for(; i -#endif - -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double v_x2 = {x2,x2}; - __vector double v_x3 = {x3,x3}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; + __asm__ volatile ( + "vlrepg %%v0,0(%5) \n\t" + "vlrepg %%v1,8(%5) \n\t" + "vlrepg %%v2,16(%5) \n\t" + "vlrepg %%v3,24(%5) \n\t" + "vlrepg %%v4,%7 \n\t" + "vfmdb %%v0,%%v0,%%v4 \n\t" + "vfmdb %%v1,%%v1,%%v4 \n\t" + "vfmdb %%v2,%%v2,%%v4 \n\t" + "vfmdb %%v3,%%v3,%%v4 \n\t" + "xgr %%r1,%%r1 \n\t" - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ; - } -} + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" -#else + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + "vl %%v24,32(%%r1,%1) \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vl %%v28,48(%%r1,%1) \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vl %%v31,48(%%r1,%4) \n\t" - for ( i=0; i<4; i++) - x[i] = xo[i] * *alpha; + "vl %%v4,0(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; - } -} + "vl %%v4,16(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + "vl %%v4,32(%%r1,%6) \n\t" + "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,32(%%r1,%6) \n\t" -#endif + "vl %%v4,48(%%r1,%6) \n\t" + "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,48(%%r1,%6) \n\t" -#ifdef HAVE_KERNEL_4x2 + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,64(%%r1,%2) \n\t" + "vl %%v18,64(%%r1,%3) \n\t" + "vl %%v19,64(%%r1,%4) \n\t" + "vl %%v20,80(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,80(%%r1,%3) \n\t" + "vl %%v23,80(%%r1,%4) \n\t" + "vl %%v24,96(%%r1,%1) \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vl %%v28,112(%%r1,%1) \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%4) \n\t" -#elif HAVE_KERNEL_4x2_VEC + "vl %%v4,64(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,64(%%r1,%6) \n\t" -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; + "vl %%v4,80(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,80(%%r1,%6) \n\t" - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } -} -#else + "vl %%v4,96(%%r1,%6) \n\t" + "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,96(%%r1,%6) \n\t" -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT *a0,*a1; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 = ap[1]; - - for ( i=0; i<2; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; - } -} - - -#endif - -#ifdef HAVE_KERNEL_4x1 - -#elif HAVE_KERNEL_4x1_VEC -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0; - x0 = xo[0] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } + "vl %%v4,112(%%r1,%6) \n\t" + "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,112(%%r1,%6) \n\t" - + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap; + __asm__ volatile ( + "vlrepg %%v0,0(%3) \n\t" + "vlrepg %%v1,8(%3) \n\t" + "vlrepg %%v2,%5 \n\t" + "vfmdb %%v0,%%v0,%%v2 \n\t" + "vfmdb %%v1,%%v1,%%v2 \n\t" + "xgr %%r1,%%r1 \n\t" - for ( i=0; i<1; i++) - x[i] = xo[i] * *alpha; + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0]; - y[i+1] += a0[i+1]*x[0]; - y[i+2] += a0[i+2]*x[0]; - y[i+3] += a0[i+3]*x[0]; - } + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "vl %%v20,32(%%r1,%1) \n\t" + "vl %%v21,32(%%r1,%2) \n\t" + "vl %%v22,48(%%r1,%1) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vl %%v26,80(%%r1,%1) \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vl %%v28,96(%%r1,%1) \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%1) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "vl %%v2,32(%%r1,%4) \n\t" + "vfmadb %%v2,%%v20,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v21,%%v1,%%v2 \n\t" + "vst %%v2,32(%%r1,%4) \n\t" + + "vl %%v2,48(%%r1,%4) \n\t" + "vfmadb %%v2,%%v22,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v23,%%v1,%%v2 \n\t" + "vst %%v2,48(%%r1,%4) \n\t" + + "vl %%v2,64(%%r1,%4) \n\t" + "vfmadb %%v2,%%v24,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v25,%%v1,%%v2 \n\t" + "vst %%v2,64(%%r1,%4) \n\t" + + "vl %%v2,80(%%r1,%4) \n\t" + "vfmadb %%v2,%%v26,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v27,%%v1,%%v2 \n\t" + "vst %%v2,80(%%r1,%4) \n\t" + + "vl %%v2,96(%%r1,%4) \n\t" + "vfmadb %%v2,%%v28,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v29,%%v1,%%v2 \n\t" + "vst %%v2,96(%%r1,%4) \n\t" + + "vl %%v2,112(%%r1,%4) \n\t" + "vfmadb %%v2,%%v30,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v31,%%v1,%%v2 \n\t" + "vst %%v2,112(%%r1,%4) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepg %%v0,0(%2) \n\t" + "vlrepg %%v1,%4 \n\t" + "vfmdb %%v0,%%v0,%%v1 \n\t" + "xgr %%r1,%%r1 \n\t" -#endif + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" - + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%1) \n\t" + "vl %%v22,96(%%r1,%1) \n\t" + "vl %%v23,112(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "vl %%v1,32(%%r1,%3) \n\t" + "vfmadb %%v1,%%v18,%%v0,%%v1 \n\t" + "vst %%v1,32(%%r1,%3) \n\t" + + "vl %%v1,48(%%r1,%3) \n\t" + "vfmadb %%v1,%%v19,%%v0,%%v1 \n\t" + "vst %%v1,48(%%r1,%3) \n\t" + + "vl %%v1,64(%%r1,%3) \n\t" + "vfmadb %%v1,%%v20,%%v0,%%v1 \n\t" + "vst %%v1,64(%%r1,%3) \n\t" + + "vl %%v1,80(%%r1,%3) \n\t" + "vfmadb %%v1,%%v21,%%v0,%%v1 \n\t" + "vst %%v1,80(%%r1,%3) \n\t" + + "vl %%v1,96(%%r1,%3) \n\t" + "vfmadb %%v1,%%v22,%%v0,%%v1 \n\t" + "vst %%v1,96(%%r1,%3) \n\t" + + "vl %%v1,112(%%r1,%3) \n\t" + "vfmadb %%v1,%%v23,%%v0,%%v1 \n\t" + "vst %%v1,112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; - - for ( i=0; i -#endif #define NBMAX 2048 -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; - __vector double temp2 = {0,0}; - __vector double temp3 = {0,0}; + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ; - temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; - y[2] = temp2[0] + temp2[1]; - y[3] = temp3[0] + temp3[1];; + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmadb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmadb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmadb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmadb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmadb %%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmadb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmadb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vfmadb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmadb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v4,%%v0,1 \n\t" + "adbr %%f0,%%f4 \n\t" + "std %%f0,0(%6) \n\t" + "vrepg %%v4,%%v1,1 \n\t" + "adbr %%f1,%%f4 \n\t" + "std %%f1,8(%6) \n\t" + "vrepg %%v4,%%v2,1 \n\t" + "adbr %%f2,%%f4 \n\t" + "std %%f2,16(%6) \n\t" + "vrepg %%v4,%%v3,1 \n\t" + "adbr %%f3,%%f4 \n\t" + "std %%f3,24(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; - temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; - temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; - } - y[0] = temp0; - y[1] = temp1; - y[2] = temp2; - y[3] = temp3; -} - -#endif - -#ifdef HAVE_KERNEL_4x2 - -#elif HAVE_KERNEL_4x2_VEC static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmadb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmadb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmadb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmadb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmadb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v2,%%v0,1 \n\t" + "adbr %%f0,%%f2 \n\t" + "std %%f0,0(%4) \n\t" + "vrepg %%v2,%%v1,1 \n\t" + "adbr %%f1,%%f2 \n\t" + "std %%f1,8(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - - BLASLONG i; - FLOAT *a0,*a1; - a0 = ap[0]; - a1 = ap[1]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; - } - y[0] = temp0; - y[1] = temp1; - -} -#endif - -#ifdef HAVE_KERNEL_4x1 - -#elif HAVE_KERNEL_4x1_VEC static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)a0; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; -} -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - - - FLOAT temp0 = 0.0; + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - } - y[0] = temp0; + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "std %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - + static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for ( i=0; i 0) { + + maxf = dmax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c new file mode 100644 index 000000000..d7c86735f --- /dev/null +++ b/kernel/zarch/dmin.c @@ -0,0 +1,182 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = dmin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index bf29538c7..c91f95800 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc", "r1" ,"v0","v1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ ( + "vlrepg %%v0,%3 \n\t" + "vlrepg %%v1,%4 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -214,8 +204,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -32; if ( n1 > 0 ) { - - drot_kernel_32(n1, x, y, c, s); + FLOAT cosa,sina; + cosa=c; + sina=s; + drot_kernel_32(n1, x, y, &cosa, &sina); i=n1; } @@ -229,6 +221,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + } else { @@ -250,3 +243,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index e29f51012..ccc6dd95d 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -27,135 +27,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#ifdef Z13_A -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) +static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__ volatile ( + "vlrepg %%v0,%1 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%2) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vl %%v25, 16(%%r1,%2) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vl %%v26, 32(%%r1,%2) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vl %%v27, 48(%%r1,%2) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 64(%%r1,%2) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 64(%%r1,%2) \n\t" + "vl %%v25, 80(%%r1,%2) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 80(%%r1,%2) \n\t" + "vl %%v26, 96(%%r1,%2) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 96(%%r1,%2) \n\t" + "vl %%v27, 112(%%r1,%2) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v24","v25","v26","v27" + ); +} - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "srlg %[n],%[n],4 \n\t" - "vlr %%v1,%%v0 \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "la %[x_ptr], 128(%[x_ptr]) \n\t" - "aghik %[n], %[n], -1 \n\t" - "jle 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v0 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "vlm %%v16,%%v19, 0(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vlm %%v20,%%v23, 64(%[x_ptr]) \n\t" - "lay %[x_ptr], -128(%[x_ptr]) \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "brctg %[n],1b \n\t" - "2: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v1 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "lay %[x_ptr] , -128(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n) - : [alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - } -#else -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) +static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" - /* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */ - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "vlr %%v1,%%v0 \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v1 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v1 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v1 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v1 \n\t" - "vstm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vlm %%v24,%%v31,128(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vfmdb %%v25,%%v25,%%v1 \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vfmdb %%v27,%%v27,%%v1 \n\t" - "vfmdb %%v28,%%v28,%%v0 \n\t" - "vfmdb %%v29,%%v29,%%v1 \n\t" - "vfmdb %%v30,%%v30,%%v0 \n\t" - "vfmdb %%v31,%%v31,%%v1 \n\t" - "vstm %%v24,%%v31,128(%[x_ptr]) \n\t" - "la %[x_ptr], 256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - } -#endif -static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) -{ - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "sllg %%r0,%[n],3 \n\t" - "vzero %%v25 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vst %%v24, 32(%[x_ptr]) \n\t" - "vst %%v25, 48(%[x_ptr]) \n\t" - "vst %%v24, 64(%[x_ptr]) \n\t" - "vst %%v25, 80(%[x_ptr]) \n\t" - "vst %%v24, 96(%[x_ptr]) \n\t" - "vst %%v25, 112(%[x_ptr]) \n\t" - "vst %%v24, 128(%[x_ptr]) \n\t" - "vst %%v25, 144(%[x_ptr]) \n\t" - "vst %%v24, 160(%[x_ptr]) \n\t" - "vst %%v25, 176(%[x_ptr]) \n\t" - "vst %%v24, 192(%[x_ptr]) \n\t" - "vst %%v25, 208(%[x_ptr]) \n\t" - "vst %%v24, 224(%[x_ptr]) \n\t" - "vst %%v25, 240(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" , "r0", "v24" ,"v25" - ); + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); } - - - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0,j=0; @@ -169,11 +109,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 > 0 ) { - dscal_kernel_32_zero(n1 , x); + dscal_kernel_16_zero(n1, x); j=n1; } @@ -188,10 +128,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 > 0 ) { - dscal_kernel_32(n1 , da , x); + dscal_kernel_16(n1, da, x); j=n1; } while(j < n) @@ -260,4 +200,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } return 0; -} \ No newline at end of file +} + + diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c new file mode 100644 index 000000000..17461a029 --- /dev/null +++ b/kernel/zarch/dsdot.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2018,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + double dot; + + __asm__ volatile ( + "vzero %%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%3) \n\t" + "vfmsb %%v16,%%v16,%%v24 \n\t" + "vl %%v25,16(%%r1,%3) \n\t" + "vfmsb %%v17,%%v17,%%v25 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmsb %%v18,%%v18,%%v26 \n\t" + "vl %%v27,48(%%r1,%3) \n\t" + "vfmsb %%v19,%%v19,%%v27 \n\t" + "vl %%v28,64(%%r1,%3) \n\t" + "vfmsb %%v20,%%v20,%%v28 \n\t" + "vl %%v29,80(%%r1,%3) \n\t" + "vfmsb %%v21,%%v21,%%v29 \n\t" + "vl %%v30,96(%%r1,%3) \n\t" + "vfmsb %%v22,%%v22,%%v30 \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vfmsb %%v23,%%v23,%%v31 \n\t" + + "vflls %%v24,%%v16 \n\t" + "vflls %%v25,%%v17 \n\t" + "vflls %%v26,%%v18 \n\t" + "vflls %%v27,%%v19 \n\t" + "vflls %%v28,%%v20 \n\t" + "vflls %%v29,%%v21 \n\t" + "vflls %%v30,%%v22 \n\t" + "vflls %%v31,%%v23 \n\t" + + "veslg %%v16,%%v16,32 \n\t" + "veslg %%v17,%%v17,32 \n\t" + "veslg %%v18,%%v18,32 \n\t" + "veslg %%v19,%%v19,32 \n\t" + "veslg %%v20,%%v20,32 \n\t" + "veslg %%v21,%%v21,32 \n\t" + "veslg %%v22,%%v22,32 \n\t" + "veslg %%v23,%%v23,32 \n\t" + + "vflls %%v16,%%v16 \n\t" + "vflls %%v17,%%v17 \n\t" + "vflls %%v18,%%v18 \n\t" + "vflls %%v19,%%v19 \n\t" + "vflls %%v20,%%v20 \n\t" + "vflls %%v21,%%v21 \n\t" + "vflls %%v22,%%v22 \n\t" + "vflls %%v23,%%v23 \n\t" + + "vfadb %%v16,%%v16,%%v24 \n\t" + "vfadb %%v17,%%v17,%%v25 \n\t" + "vfadb %%v18,%%v18,%%v26 \n\t" + "vfadb %%v19,%%v19,%%v27 \n\t" + "vfadb %%v20,%%v20,%%v28 \n\t" + "vfadb %%v21,%%v21,%%v29 \n\t" + "vfadb %%v22,%%v22,%%v30 \n\t" + "vfadb %%v23,%%v23,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v20 \n\t" + "vfadb %%v17,%%v17,%%v21 \n\t" + "vfadb %%v18,%%v18,%%v22 \n\t" + "vfadb %%v19,%%v19,%%v23 \n\t" + "vfadb %%v16,%%v16,%%v18 \n\t" + "vfadb %%v17,%%v17,%%v19 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v0,%%v16,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(dot) + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return dot; +} + +double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + double dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + dot = dsdot_kernel_32(n1,x,y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index d7e079147..8070ef41a 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,217 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - - #include "common.h" - - -#if defined(Z13_SWAP_A) -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" + __asm__ volatile( + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else - -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} - -#endif - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; @@ -284,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, } - - diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c new file mode 100644 index 000000000..e7f096e0d --- /dev/null +++ b/kernel/zarch/icamax.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v0,8(%3),1 \n\t" + "vlef %%v1,12(%3),1 \n\t" + "vlef %%v0,16(%3),2 \n\t" + "vlef %%v1,20(%3),2 \n\t" + "vlef %%v0,24(%3),3 \n\t" + "vlef %%v1,28(%3),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v1,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,16 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%3) \n\t" + + "vlef %%v16,0(%%r1,%3),0 \n\t" + "vlef %%v17,4(%%r1,%3),0 \n\t" + "vlef %%v16,8(%%r1,%3),1 \n\t" + "vlef %%v17,12(%%r1,%3),1 \n\t" + "vlef %%v16,16(%%r1,%3),2 \n\t" + "vlef %%v17,20(%%r1,%3),2 \n\t" + "vlef %%v16,24(%%r1,%3),3 \n\t" + "vlef %%v17,28(%%r1,%3),3 \n\t" + + "vlef %%v18,32(%%r1,%3),0 \n\t" + "vlef %%v19,36(%%r1,%3),0 \n\t" + "vlef %%v18,40(%%r1,%3),1 \n\t" + "vlef %%v19,44(%%r1,%3),1 \n\t" + "vlef %%v18,48(%%r1,%3),2 \n\t" + "vlef %%v19,52(%%r1,%3),2 \n\t" + "vlef %%v18,56(%%r1,%3),3 \n\t" + "vlef %%v19,30(%%r1,%3),3 \n\t" + + "vlef %%v20,64(%%r1,%3),0 \n\t" + "vlef %%v21,68(%%r1,%3),0 \n\t" + "vlef %%v20,72(%%r1,%3),1 \n\t" + "vlef %%v21,76(%%r1,%3),1 \n\t" + "vlef %%v20,80(%%r1,%3),2 \n\t" + "vlef %%v21,84(%%r1,%3),2 \n\t" + "vlef %%v20,88(%%r1,%3),3 \n\t" + "vlef %%v21,92(%%r1,%3),3 \n\t" + + "vlef %%v22,96(%%r1,%3),0 \n\t" + "vlef %%v23,100(%%r1,%3),0 \n\t" + "vlef %%v22,104(%%r1,%3),1 \n\t" + "vlef %%v23,108(%%r1,%3),1 \n\t" + "vlef %%v22,112(%%r1,%3),2 \n\t" + "vlef %%v23,116(%%r1,%3),2 \n\t" + "vlef %%v22,120(%%r1,%3),3 \n\t" + "vlef %%v23,124(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vlef %%v16,128(%%r1,%3),0 \n\t" + "vlef %%v17,132(%%r1,%3),0 \n\t" + "vlef %%v16,136(%%r1,%3),1 \n\t" + "vlef %%v17,140(%%r1,%3),1 \n\t" + "vlef %%v16,144(%%r1,%3),2 \n\t" + "vlef %%v17,148(%%r1,%3),2 \n\t" + "vlef %%v16,152(%%r1,%3),3 \n\t" + "vlef %%v17,156(%%r1,%3),3 \n\t" + + "vlef %%v18,160(%%r1,%3),0 \n\t" + "vlef %%v19,164(%%r1,%3),0 \n\t" + "vlef %%v18,168(%%r1,%3),1 \n\t" + "vlef %%v19,172(%%r1,%3),1 \n\t" + "vlef %%v18,176(%%r1,%3),2 \n\t" + "vlef %%v19,180(%%r1,%3),2 \n\t" + "vlef %%v18,184(%%r1,%3),3 \n\t" + "vlef %%v19,188(%%r1,%3),3 \n\t" + + "vlef %%v20,192(%%r1,%3),0 \n\t" + "vlef %%v21,196(%%r1,%3),0 \n\t" + "vlef %%v20,200(%%r1,%3),1 \n\t" + "vlef %%v21,204(%%r1,%3),1 \n\t" + "vlef %%v20,208(%%r1,%3),2 \n\t" + "vlef %%v21,212(%%r1,%3),2 \n\t" + "vlef %%v20,216(%%r1,%3),3 \n\t" + "vlef %%v21,220(%%r1,%3),3 \n\t" + + "vlef %%v22,224(%%r1,%3),0 \n\t" + "vlef %%v23,228(%%r1,%3),0 \n\t" + "vlef %%v22,232(%%r1,%3),1 \n\t" + "vlef %%v23,236(%%r1,%3),1 \n\t" + "vlef %%v22,240(%%r1,%3),2 \n\t" + "vlef %%v23,244(%%r1,%3),2 \n\t" + "vlef %%v22,248(%%r1,%3),3 \n\t" + "vlef %%v23,252(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = icamax_kernel_32(n1, x, &maxf); + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } +} + + diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c new file mode 100644 index 000000000..b9c1ccd9c --- /dev/null +++ b/kernel/zarch/icamin.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v0,8(%3),1 \n\t" + "vlef %%v1,12(%3),1 \n\t" + "vlef %%v0,16(%3),2 \n\t" + "vlef %%v1,20(%3),2 \n\t" + "vlef %%v0,24(%3),3 \n\t" + "vlef %%v1,28(%3),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v1,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,16 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vlef %%v16,0(%%r1,%3),0 \n\t" + "vlef %%v17,4(%%r1,%3),0 \n\t" + "vlef %%v16,8(%%r1,%3),1 \n\t" + "vlef %%v17,12(%%r1,%3),1 \n\t" + "vlef %%v16,16(%%r1,%3),2 \n\t" + "vlef %%v17,20(%%r1,%3),2 \n\t" + "vlef %%v16,24(%%r1,%3),3 \n\t" + "vlef %%v17,28(%%r1,%3),3 \n\t" + + "vlef %%v18,32(%%r1,%3),0 \n\t" + "vlef %%v19,36(%%r1,%3),0 \n\t" + "vlef %%v18,40(%%r1,%3),1 \n\t" + "vlef %%v19,44(%%r1,%3),1 \n\t" + "vlef %%v18,48(%%r1,%3),2 \n\t" + "vlef %%v19,52(%%r1,%3),2 \n\t" + "vlef %%v18,56(%%r1,%3),3 \n\t" + "vlef %%v19,30(%%r1,%3),3 \n\t" + + "vlef %%v20,64(%%r1,%3),0 \n\t" + "vlef %%v21,68(%%r1,%3),0 \n\t" + "vlef %%v20,72(%%r1,%3),1 \n\t" + "vlef %%v21,76(%%r1,%3),1 \n\t" + "vlef %%v20,80(%%r1,%3),2 \n\t" + "vlef %%v21,84(%%r1,%3),2 \n\t" + "vlef %%v20,88(%%r1,%3),3 \n\t" + "vlef %%v21,92(%%r1,%3),3 \n\t" + + "vlef %%v22,96(%%r1,%3),0 \n\t" + "vlef %%v23,100(%%r1,%3),0 \n\t" + "vlef %%v22,104(%%r1,%3),1 \n\t" + "vlef %%v23,108(%%r1,%3),1 \n\t" + "vlef %%v22,112(%%r1,%3),2 \n\t" + "vlef %%v23,116(%%r1,%3),2 \n\t" + "vlef %%v22,120(%%r1,%3),3 \n\t" + "vlef %%v23,124(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vlef %%v16,128(%%r1,%3),0 \n\t" + "vlef %%v17,132(%%r1,%3),0 \n\t" + "vlef %%v16,136(%%r1,%3),1 \n\t" + "vlef %%v17,140(%%r1,%3),1 \n\t" + "vlef %%v16,144(%%r1,%3),2 \n\t" + "vlef %%v17,148(%%r1,%3),2 \n\t" + "vlef %%v16,152(%%r1,%3),3 \n\t" + "vlef %%v17,156(%%r1,%3),3 \n\t" + + "vlef %%v18,160(%%r1,%3),0 \n\t" + "vlef %%v19,164(%%r1,%3),0 \n\t" + "vlef %%v18,168(%%r1,%3),1 \n\t" + "vlef %%v19,172(%%r1,%3),1 \n\t" + "vlef %%v18,176(%%r1,%3),2 \n\t" + "vlef %%v19,180(%%r1,%3),2 \n\t" + "vlef %%v18,184(%%r1,%3),3 \n\t" + "vlef %%v19,188(%%r1,%3),3 \n\t" + + "vlef %%v20,192(%%r1,%3),0 \n\t" + "vlef %%v21,196(%%r1,%3),0 \n\t" + "vlef %%v20,200(%%r1,%3),1 \n\t" + "vlef %%v21,204(%%r1,%3),1 \n\t" + "vlef %%v20,208(%%r1,%3),2 \n\t" + "vlef %%v21,212(%%r1,%3),2 \n\t" + "vlef %%v20,216(%%r1,%3),3 \n\t" + "vlef %%v21,220(%%r1,%3),3 \n\t" + + "vlef %%v22,224(%%r1,%3),0 \n\t" + "vlef %%v23,228(%%r1,%3),0 \n\t" + "vlef %%v22,232(%%r1,%3),1 \n\t" + "vlef %%v23,236(%%r1,%3),1 \n\t" + "vlef %%v22,240(%%r1,%3),2 \n\t" + "vlef %%v23,244(%%r1,%3),2 \n\t" + "vlef %%v22,248(%%r1,%3),3 \n\t" + "vlef %%v23,252(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = icamin_kernel_32(n1, x, &minf); + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } +} + + diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index b67091148..aba880949 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -23,164 +23,173 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vzero %%v5 \n\t" - "vzero %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "vfchdb %%v16,%%v25,%%v24 \n\t " - "vfchdb %%v17,%%v27,%%v26 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v29,%%v28 \n\t " - "vfchdb %%v17,%%v31,%%v30 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" +static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "vfchdb %%v16,%%v25,%%v24 \n\t " - "vfchdb %%v17,%%v27,%%v26 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v29,%%v28 \n\t " - "vfchdb %%v17,%%v31,%%v30 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "clgrjl %[ptr_tmp],%%r0,1b \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "2: \n\t" - "wfchdb %%v16,%%v26,%%v18 \n\t" - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[maxf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return index; + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamax; } - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; - BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG max = 0; @@ -191,7 +200,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -32; if (n1 > 0) { - max = diamax_kernel_32_TUNED(n1, x, &maxf); + max = idamax_kernel_32(n1, x, &maxf); i = n1; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 8a7ff1659..3213efa4d 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -23,192 +23,185 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vlrepg %%v18,0(%[ptr_x]) \n\t" - "vzero %%v5 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" +static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" - "vfchdb %%v16,%%v24,%%v25 \n\t " - "vfchdb %%v17,%%v26 ,%%v27 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28, %%v29 \n\t " - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v16,%%v24,%%v25 \n\t" - "vfchdb %%v17,%%v26 ,%%v27 \n\t" - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28 ,%%v29 \n\t" - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" - - "2: \n\t" - "wfchdb %%v16,%%v18 ,%%v26 \n\t " - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[minf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - - return index; + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return iamin; } - - - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; - BLASLONG ix = 0; - BLASLONG min = 0; FLOAT minf = 0.0; - + BLASLONG min = 0; + if (n <= 0 || inc_x <= 0) return (min); - minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant + if (inc_x == 1) { BLASLONG n1 = n & -32; if (n1 > 0) { - min = diamin_kernel_32(n1, x, &minf); + min = idamin_kernel_32(n1, x, &minf); + i = n1; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c new file mode 100644 index 000000000..26fff4eb0 --- /dev/null +++ b/kernel/zarch/idmax.c @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) +{ + BLASLONG imax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imax),"=m"(*max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = idmax_kernel_32(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c new file mode 100644 index 000000000..570b33a15 --- /dev/null +++ b/kernel/zarch/idmin.c @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) +{ + BLASLONG imin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imin),"=m"(*min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = idmin_kernel_32(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c new file mode 100644 index 000000000..95a665b10 --- /dev/null +++ b/kernel/zarch/isamax.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = isamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c new file mode 100644 index 000000000..640fc02c9 --- /dev/null +++ b/kernel/zarch/isamin.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = isamin_kernel_64(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c new file mode 100644 index 000000000..0eb350315 --- /dev/null +++ b/kernel/zarch/ismax.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) +{ + BLASLONG imax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imax),"=m"(*max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = ismax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c new file mode 100644 index 000000000..f050db8cb --- /dev/null +++ b/kernel/zarch/ismin.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) +{ + BLASLONG imin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imin),"=m"(*min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = ismin_kernel_64(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 216c3414a..bf5f621a7 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -24,190 +24,165 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) +static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + __asm__ volatile ( + "vleg %%v0,0(%3),0 \n\t" + "vleg %%v1,8(%3),0 \n\t" + "vleg %%v0,16(%3),1 \n\t" + "vleg %%v1,24(%3),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v1,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,8 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "srlg %%r0,%2,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" - -/** - * Find maximum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - - - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v6 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" + "vleg %%v16,0(%%r1,%3),0 \n\t" + "vleg %%v17,8(%%r1,%3),0 \n\t" + "vleg %%v16,16(%%r1,%3),1 \n\t" + "vleg %%v17,24(%%r1,%3),1 \n\t" + "vleg %%v18,32(%%r1,%3),0 \n\t" + "vleg %%v19,40(%%r1,%3),0 \n\t" + "vleg %%v18,48(%%r1,%3),1 \n\t" + "vleg %%v19,56(%%r1,%3),1 \n\t" + "vleg %%v20,64(%%r1,%3),0 \n\t" + "vleg %%v21,72(%%r1,%3),0 \n\t" + "vleg %%v20,80(%%r1,%3),1 \n\t" + "vleg %%v21,88(%%r1,%3),1 \n\t" + "vleg %%v22,96(%%r1,%3),0 \n\t" + "vleg %%v23,104(%%r1,%3),0 \n\t" + "vleg %%v22,112(%%r1,%3),1 \n\t" + "vleg %%v23,120(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 , 128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 , 232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 , 240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 , 248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfchdb %%v25,%%v1,%%v0 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v3,%%v2 \n\t " - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v26,%%v24 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v30,%%v28 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24, %%v1,%%v31 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30, %%v27,%%v3 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0, %%v31,%%v28 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30, %%v27,%%v6 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3 \n\t" - "2: \n\t" - "wfchdb %%v16,%%v26,%%v6 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[maxf] \n\t" - "3: \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" - ); - return index; + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + "vleg %%v16,128(%%r1,%3),0 \n\t" + "vleg %%v17,136(%%r1,%3),0 \n\t" + "vleg %%v16,144(%%r1,%3),1 \n\t" + "vleg %%v17,152(%%r1,%3),1 \n\t" + "vleg %%v18,160(%%r1,%3),0 \n\t" + "vleg %%v19,168(%%r1,%3),0 \n\t" + "vleg %%v18,176(%%r1,%3),1 \n\t" + "vleg %%v19,184(%%r1,%3),1 \n\t" + "vleg %%v20,192(%%r1,%3),0 \n\t" + "vleg %%v21,200(%%r1,%3),0 \n\t" + "vleg %%v20,208(%%r1,%3),1 \n\t" + "vleg %%v21,216(%%r1,%3),1 \n\t" + "vleg %%v22,224(%%r1,%3),0 \n\t" + "vleg %%v23,232(%%r1,%3),0 \n\t" + "vleg %%v22,240(%%r1,%3),1 \n\t" + "vleg %%v23,248(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamax; } - - - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; @@ -223,9 +198,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG n1 = n & -16; if (n1 > 0) { - max = ziamax_kernel_16_TUNED(n1, x, &maxf); + max = izamax_kernel_16(n1, x, &maxf); + i = n1; - ix = n1 << 1; } while(i < n) @@ -260,7 +235,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return (max + 1); } - } diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 9b2a653a7..3636e8fdf 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -24,253 +24,217 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -/** - * Find minimum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index ; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - "ld %%f6,0(%[ptr_x]) \n\t" - "lpdbr %%f6,%%f6 \n\t" - "ld %%f7,8(%[ptr_x]) \n\t" - "lpdbr %%f7,%%f7 \n\t" - "adbr %%f6,%%f7 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vrepg %%v6,%%v6,0 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" +static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vleg %%v0,0(%3),0 \n\t" + "vleg %%v1,8(%3),0 \n\t" + "vleg %%v0,16(%3),1 \n\t" + "vleg %%v1,24(%3),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v1,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,8 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "srlg %%r0,%2,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vleg %%v16,0(%%r1,%3),0 \n\t" + "vleg %%v17,8(%%r1,%3),0 \n\t" + "vleg %%v16,16(%%r1,%3),1 \n\t" + "vleg %%v17,24(%%r1,%3),1 \n\t" + "vleg %%v18,32(%%r1,%3),0 \n\t" + "vleg %%v19,40(%%r1,%3),0 \n\t" + "vleg %%v18,48(%%r1,%3),1 \n\t" + "vleg %%v19,56(%%r1,%3),1 \n\t" + "vleg %%v20,64(%%r1,%3),0 \n\t" + "vleg %%v21,72(%%r1,%3),0 \n\t" + "vleg %%v20,80(%%r1,%3),1 \n\t" + "vleg %%v21,88(%%r1,%3),1 \n\t" + "vleg %%v22,96(%%r1,%3),0 \n\t" + "vleg %%v23,104(%%r1,%3),0 \n\t" + "vleg %%v22,112(%%r1,%3),1 \n\t" + "vleg %%v23,120(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 ,128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 ,136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 ,144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 ,152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 ,160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 ,168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 ,176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 ,184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 ,192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 ,200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 ,208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 ,216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 ,224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - - "vfchdb %%v25,%%v0 ,%%v1 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v2,%%v3 \n\t" - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v24,%%v26 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v28,%%v30 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24,%%v31, %%v1 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30,%%v3, %%v27 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0,%%v28, %%v31 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30,%%v6 , %%v27 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3f \n\t" - "2: \n\t" - "wfchdb %%v16,%%v6 ,%%v26 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[minf] \n\t" - "3: \n\t" + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" - ); + "vleg %%v16,128(%%r1,%3),0 \n\t" + "vleg %%v17,136(%%r1,%3),0 \n\t" + "vleg %%v16,144(%%r1,%3),1 \n\t" + "vleg %%v17,152(%%r1,%3),1 \n\t" + "vleg %%v18,160(%%r1,%3),0 \n\t" + "vleg %%v19,168(%%r1,%3),0 \n\t" + "vleg %%v18,176(%%r1,%3),1 \n\t" + "vleg %%v19,184(%%r1,%3),1 \n\t" + "vleg %%v20,192(%%r1,%3),0 \n\t" + "vleg %%v21,200(%%r1,%3),0 \n\t" + "vleg %%v20,208(%%r1,%3),1 \n\t" + "vleg %%v21,216(%%r1,%3),1 \n\t" + "vleg %%v22,224(%%r1,%3),0 \n\t" + "vleg %%v23,232(%%r1,%3),0 \n\t" + "vleg %%v22,240(%%r1,%3),1 \n\t" + "vleg %%v23,248(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - return index; + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamin; } - - - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf; - BLASLONG min=0; + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(min); - - + if (inc_x == 1) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + BLASLONG n1 = n & -16; + if (n1 > 0) { + + min = izamin_kernel_16(n1, x, &minf); - min = ziamin_kernel_16_TUNED(n1, x, &minf); i = n1; - ix = n1 << 1; - } - else { - //assign minf - minf = CABS1(x,0); - ix += 2; - i++; - } + } - while(i < n) + while(i < n) + { + if( CABS1(x,ix) < minf ) { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; + min = i; + minf = CABS1(x,ix); } + ix += 2; + i++; + } return (min + 1); } else { - inc_x2 = 2 * inc_x; + inc_x2 = 2 * inc_x; - minf = CABS1(x,0); + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } ix += inc_x2; i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } + } return (min + 1); } - } diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c new file mode 100644 index 000000000..1025cfcbf --- /dev/null +++ b/kernel/zarch/samax.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vfchsb %%v26,%%v20,%%v21 \n\t" + "vfchsb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v24,%%v25 \n\t" + "vfchsb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vfchsb %%v26,%%v20,%%v21 \n\t" + "vfchsb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v24,%%v25 \n\t" + "vfchsb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + maxf = samax_kernel_64(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c new file mode 100644 index 000000000..3b8f03e6a --- /dev/null +++ b/kernel/zarch/samin.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = samin_kernel_64(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c new file mode 100644 index 000000000..2c59ab2e5 --- /dev/null +++ b/kernel/zarch/sasum.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT asum; + + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vfasb %%v0,%%v0,%%v3 \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "ler %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return sumf; + + if (inc_x == 1) { + + n1 = n & -64; + + if (n1 > 0) { + + sumf = sasum_kernel_64(n1, x); + i = n1; + } + + while (i < n) { + sumf += ABS(x[i]); + i++; + } + + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { + + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + + i += inc_x * 4; + j += 4; + + } + sumf = sum1 + sum2; + while (j < n) { + + sumf += ABS(x[i]); + i += inc_x; + j++; + } + + + } + return sumf; +} + + diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c new file mode 100644 index 000000000..26ead310c --- /dev/null +++ b/kernel/zarch/saxpy.c @@ -0,0 +1,184 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( + "vlrepf %%v0,%3 \n\t" + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + + "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,80(%%r1,%1) \n\t" + "vl %%v26,96(%%r1,%1) \n\t" + "vl %%v27,112(%%r1,%1) \n\t" + "vl %%v28,64(%%r1,%2) \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vl %%v30,96(%%r1,%2) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" + "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" + "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" + "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "vl %%v16,128(%%r1,%1) \n\t" + "vl %%v17,144(%%r1,%1) \n\t" + "vl %%v18,160(%%r1,%1) \n\t" + "vl %%v19,176(%%r1,%1) \n\t" + "vl %%v20,128(%%r1,%2) \n\t" + "vl %%v21,144(%%r1,%2) \n\t" + "vl %%v22,160(%%r1,%2) \n\t" + "vl %%v23,176(%%r1,%2) \n\t" + + "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" + + "vl %%v24,192(%%r1,%1) \n\t" + "vl %%v25,208(%%r1,%1) \n\t" + "vl %%v26,224(%%r1,%1) \n\t" + "vl %%v27,240(%%r1,%1) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" + + "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" + "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" + "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" + "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" + + "vst %%v16,128(%%r1,%2) \n\t" + "vst %%v17,144(%%r1,%2) \n\t" + "vst %%v18,160(%%r1,%2) \n\t" + "vst %%v19,176(%%r1,%2) \n\t" + "vst %%v20,192(%%r1,%2) \n\t" + "vst %%v21,208(%%r1,%2) \n\t" + "vst %%v22,224(%%r1,%2) \n\t" + "vst %%v23,240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return 0 ; + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + + if ( n1 ) + saxpy_kernel_64(n1, x, y , &da); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return 0 ; + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return 0 ; + +} + + diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c new file mode 100644 index 000000000..ff4227595 --- /dev/null +++ b/kernel/zarch/scopy.c @@ -0,0 +1,85 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,6 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","r2" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + if (n <= 0) return 0; + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + scopy_kernel_64(n1, x, y); + i = n1; + } + + while (i < n) { + y[i] = x[i]; + i++; + + } + + + } else { + + while (i < n) { + + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; + + } + + } + return 0; + + +} diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c new file mode 100644 index 000000000..fd8c8e445 --- /dev/null +++ b/kernel/zarch/sdot.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2018,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + FLOAT dot; + + __asm__ volatile ( + "vzero %%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%3) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,16(%%r1,%3) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + "vl %%v27,48(%%r1,%3) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + "vl %%v28,64(%%r1,%3) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%3) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + "vl %%v30,96(%%r1,%3) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepf %%v1,%%v0,1 \n\t" + "vrepf %%v2,%%v0,2 \n\t" + "vrepf %%v3,%%v0,3 \n\t" + "aebr %%f0,%%f1 \n\t" + "aebr %%f0,%%f2 \n\t" + "aebr %%f0,%%f3 \n\t" + "ler %0,%%f0 " + :"=f"(dot) + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return dot; +} + +FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + dot = sdot_kernel_32(n1,x,y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c new file mode 100644 index 000000000..92019d732 --- /dev/null +++ b/kernel/zarch/sgemv_n_4.c @@ -0,0 +1,668 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%5) \n\t" + "vlrepf %%v1,4(%5) \n\t" + "vlrepf %%v2,8(%5) \n\t" + "vlrepf %%v3,12(%5) \n\t" + "vlrepf %%v4,%7 \n\t" + "vfmsb %%v0,%%v0,%%v4 \n\t" + "vfmsb %%v1,%%v1,%%v4 \n\t" + "vfmsb %%v2,%%v2,%%v4 \n\t" + "vfmsb %%v3,%%v3,%%v4 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + "vl %%v24,32(%%r1,%1) \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vl %%v28,48(%%r1,%1) \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "vl %%v4,32(%%r1,%6) \n\t" + "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,32(%%r1,%6) \n\t" + + "vl %%v4,48(%%r1,%6) \n\t" + "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,48(%%r1,%6) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,64(%%r1,%2) \n\t" + "vl %%v18,64(%%r1,%3) \n\t" + "vl %%v19,64(%%r1,%4) \n\t" + "vl %%v20,80(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,80(%%r1,%3) \n\t" + "vl %%v23,80(%%r1,%4) \n\t" + "vl %%v24,96(%%r1,%1) \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vl %%v28,112(%%r1,%1) \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + + "vl %%v4,64(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,64(%%r1,%6) \n\t" + + "vl %%v4,80(%%r1,%6) \n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,80(%%r1,%6) \n\t" + + "vl %%v4,96(%%r1,%6) \n\t" + "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,96(%%r1,%6) \n\t" + + "vl %%v4,112(%%r1,%6) \n\t" + "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,112(%%r1,%6) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%3) \n\t" + "vlrepf %%v1,4(%3) \n\t" + "vlrepf %%v2,%5 \n\t" + "vfmsb %%v0,%%v0,%%v2 \n\t" + "vfmsb %%v1,%%v1,%%v2 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "vl %%v20,32(%%r1,%1) \n\t" + "vl %%v21,32(%%r1,%2) \n\t" + "vl %%v22,48(%%r1,%1) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vl %%v26,80(%%r1,%1) \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vl %%v28,96(%%r1,%1) \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%1) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmasb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "vl %%v2,32(%%r1,%4) \n\t" + "vfmasb %%v2,%%v20,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v21,%%v1,%%v2 \n\t" + "vst %%v2,32(%%r1,%4) \n\t" + + "vl %%v2,48(%%r1,%4) \n\t" + "vfmasb %%v2,%%v22,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v23,%%v1,%%v2 \n\t" + "vst %%v2,48(%%r1,%4) \n\t" + + "vl %%v2,64(%%r1,%4) \n\t" + "vfmasb %%v2,%%v24,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v25,%%v1,%%v2 \n\t" + "vst %%v2,64(%%r1,%4) \n\t" + + "vl %%v2,80(%%r1,%4) \n\t" + "vfmasb %%v2,%%v26,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v27,%%v1,%%v2 \n\t" + "vst %%v2,80(%%r1,%4) \n\t" + + "vl %%v2,96(%%r1,%4) \n\t" + "vfmasb %%v2,%%v28,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v29,%%v1,%%v2 \n\t" + "vst %%v2,96(%%r1,%4) \n\t" + + "vl %%v2,112(%%r1,%4) \n\t" + "vfmasb %%v2,%%v30,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v31,%%v1,%%v2 \n\t" + "vst %%v2,112(%%r1,%4) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%2) \n\t" + "vlrepf %%v1,%4 \n\t" + "vfmsb %%v0,%%v0,%%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%1) \n\t" + "vl %%v22,96(%%r1,%1) \n\t" + "vl %%v23,112(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmasb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "vl %%v1,32(%%r1,%3) \n\t" + "vfmasb %%v1,%%v18,%%v0,%%v1 \n\t" + "vst %%v1,32(%%r1,%3) \n\t" + + "vl %%v1,48(%%r1,%3) \n\t" + "vfmasb %%v1,%%v19,%%v0,%%v1 \n\t" + "vst %%v1,48(%%r1,%3) \n\t" + + "vl %%v1,64(%%r1,%3) \n\t" + "vfmasb %%v1,%%v20,%%v0,%%v1 \n\t" + "vst %%v1,64(%%r1,%3) \n\t" + + "vl %%v1,80(%%r1,%3) \n\t" + "vfmasb %%v1,%%v21,%%v0,%%v1 \n\t" + "vst %%v1,80(%%r1,%3) \n\t" + + "vl %%v1,96(%%r1,%3) \n\t" + "vfmasb %%v1,%%v22,%%v0,%%v1 \n\t" + "vst %%v1,96(%%r1,%3) \n\t" + + "vl %%v1,112(%%r1,%3) \n\t" + "vfmasb %%v1,%%v23,%%v0,%%v1 \n\t" + "vst %%v1,112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i]; + dest += inc_dest; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8],*ybuffer; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + ybuffer = buffer; + + n1 = n >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c new file mode 100644 index 000000000..efc06297f --- /dev/null +++ b/kernel/zarch/sgemv_t_4.c @@ -0,0 +1,826 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepf %%v4,%%v0,1 \n\t" + "aebr %%f0,%%f4 \n\t" + "vrepf %%v4,%%v0,2 \n\t" + "aebr %%f0,%%f4 \n\t" + "vrepf %%v4,%%v0,3 \n\t" + "aebr %%f0,%%f4 \n\t" + "ste %%f0,0(%6) \n\t" + "vrepf %%v4,%%v1,1 \n\t" + "aebr %%f1,%%f4 \n\t" + "vrepf %%v4,%%v1,2 \n\t" + "aebr %%f1,%%f4 \n\t" + "vrepf %%v4,%%v1,3 \n\t" + "aebr %%f1,%%f4 \n\t" + "ste %%f1,4(%6) \n\t" + "vrepf %%v4,%%v2,1 \n\t" + "aebr %%f2,%%f4 \n\t" + "vrepf %%v4,%%v2,2 \n\t" + "aebr %%f2,%%f4 \n\t" + "vrepf %%v4,%%v2,3 \n\t" + "aebr %%f2,%%f4 \n\t" + "ste %%f2,8(%6) \n\t" + "vrepf %%v4,%%v3,1 \n\t" + "aebr %%f3,%%f4 \n\t" + "vrepf %%v4,%%v3,2 \n\t" + "aebr %%f3,%%f4 \n\t" + "vrepf %%v4,%%v3,3 \n\t" + "aebr %%f3,%%f4 \n\t" + "ste %%f3,12(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepf %%v2,%%v0,1 \n\t" + "aebr %%f0,%%f2 \n\t" + "vrepf %%v2,%%v0,2 \n\t" + "aebr %%f0,%%f2 \n\t" + "vrepf %%v2,%%v0,3 \n\t" + "aebr %%f0,%%f2 \n\t" + "ste %%f0,0(%4) \n\t" + "vrepf %%v2,%%v1,1 \n\t" + "aebr %%f1,%%f2 \n\t" + "vrepf %%v2,%%v1,2 \n\t" + "aebr %%f1,%%f2 \n\t" + "vrepf %%v2,%%v1,3 \n\t" + "aebr %%f1,%%f2 \n\t" + "ste %%f1,4(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepf %%v1,%%v0,1 \n\t" + "aebr %%f0,%%f1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "vrepf %%v1,%%v0,3 \n\t" + "aebr %%f0,%%f1 \n\t" + "ste %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + dest[i] = *src; + src += inc_src; + } +} + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) +{ + __asm__ volatile ( + "vlrepf %%v0,%1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + "vl %%v25, 16(%%r1,%3) \n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" + "vst %%v25, 16(%%r1,%3) \n\t" + "vl %%v26, 32(%%r1,%3) \n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" + "vst %%v26, 32(%%r1,%3) \n\t" + "vl %%v27, 48(%%r1,%3) \n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" + "vst %%v27, 48(%%r1,%3) \n\t" + "vl %%v28, 64(%%r1,%3) \n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" + "vst %%v28, 64(%%r1,%3) \n\t" + "vl %%v29, 80(%%r1,%3) \n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" + "vst %%v29, 80(%%r1,%3) \n\t" + "vl %%v30, 96(%%r1,%3) \n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" + "vst %%v30, 96(%%r1,%3) \n\t" + "vl %%v31, 112(%%r1,%3) \n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" + "vst %%v31, 112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else + { + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i] * da; + dest += inc_dest; + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG register i; + BLASLONG register j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + BLASLONG n0; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[2] __attribute__ ((aligned(16))); + FLOAT *xbuffer; + FLOAT *ytemp; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + xbuffer = buffer; + ytemp = buffer + (m < NBMAX ? m : NBMAX); + + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j 0) { + + maxf = smax_kernel_64(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c new file mode 100644 index 000000000..e882b7ff1 --- /dev/null +++ b/kernel/zarch/smin.c @@ -0,0 +1,186 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = smin_kernel_64(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c new file mode 100644 index 000000000..763cc664a --- /dev/null +++ b/kernel/zarch/srot.c @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + __asm__ ( + "vlrepf %%v0,%3 \n\t" + "vlrepf %%v1,%4 \n\t" + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT temp; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + if ( n1 > 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + srot_kernel_64(n1, x, y, &cosa, &sina); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c new file mode 100644 index 000000000..c18a7e56f --- /dev/null +++ b/kernel/zarch/sscal.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) +{ + __asm__ volatile ( + "vlrepf %%v0,%1 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%2) \n\t" + "vfmsb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vl %%v25, 16(%%r1,%2) \n\t" + "vfmsb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vl %%v26, 32(%%r1,%2) \n\t" + "vfmsb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vl %%v27, 48(%%r1,%2) \n\t" + "vfmsb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 64(%%r1,%2) \n\t" + "vfmsb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 64(%%r1,%2) \n\t" + "vl %%v25, 80(%%r1,%2) \n\t" + "vfmsb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 80(%%r1,%2) \n\t" + "vl %%v26, 96(%%r1,%2) \n\t" + "vfmsb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 96(%%r1,%2) \n\t" + "vl %%v27, 112(%%r1,%2) \n\t" + "vfmsb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v24","v25","v26","v27" + ); +} + +static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0,j=0; + if ( n <= 0 || inc_x <=0 ) + return(0); + + + if ( inc_x == 1 ) + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + + sscal_kernel_32_zero(n1, x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + sscal_kernel_32(n1, da, x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i]=0.0; + x[i + inc_x]=0.0; + + i += inc_x * 2; + j += 2; + + } + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i] = da * x[i] ; + x[i + inc_x] = da * x[i + inc_x]; + + i += inc_x * 2; + j += 2; + + } + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c new file mode 100644 index 000000000..d0c0dc3f4 --- /dev/null +++ b/kernel/zarch/sswap.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -64; + if ( n1 > 0 ) + { + sswap_kernel_64(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c new file mode 100644 index 000000000..6393b099b --- /dev/null +++ b/kernel/zarch/zamax.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + maxf = zamax_kernel_16(n1, x); + + i = n1; + } + else + { + maxf=CABS1(x,0); + i++; + } + + while (i < n) { + if (ABS(x[i*2]) > maxf) { + maxf = ABS(x[i*2]); + } + i++; + } + return (maxf); + + } else { + + inc_x2 = 2 * inc_x; + maxf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) > maxf) { + maxf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) > maxf) { + maxf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) > maxf) { + maxf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c new file mode 100644 index 000000000..b15774bb9 --- /dev/null +++ b/kernel/zarch/zamin.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + minf = zamin_kernel_16(n1, x); + + i = n1; + } + else + { + minf=CABS1(x,0); + i++; + } + + while (i < n) { + if (ABS(x[i*2]) < minf) { + minf = ABS(x[i*2]); + } + i++; + } + return (minf); + + } else { + + inc_x2 = 2 * inc_x; + minf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) < minf) { + minf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) < minf) { + minf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) < minf) { + minf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 0fc5c9ecb..8faaf20eb 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -25,92 +25,98 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif - -static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { - +static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) +{ FLOAT asum; + __asm__ ( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v22 \n\t" - "vzero %%v23 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v23,%%v23,%%v26 \n\t" - "vfadb %%v22,%%v22,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v23,%%v23,%%v30 \n\t" - "vfadb %%v22,%%v22,%%v31 \n\t" - - "vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v23,%%v23,%%v26 \n\t" - "vfadb %%v22,%%v22,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v23,%%v23,%%v30 \n\t" - "vfadb %%v22,%%v22,%%v31 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - "vfadb %%v24,%%v0,%%v1 \n\t" - "vfadb %%v25,%%v23,%%v22 \n\t" - "vfadb %%v0,%%v25,%%v24 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %[asum] ,%%f0" - : [asum] "=f"(asum),[ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x) - : "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v2 \n\t" + "vfadb %%v0,%%v0,%%v3 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + return asum; - } - - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; @@ -128,7 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( n1 > 0 ) { - sumf=zasum_kernel_16(n1, x ); + sumf = zasum_kernel_16(n1, x); i=n1; ip=2*n1; } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 212de25c8..6ba44a27c 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -23,142 +23,98 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" - -static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { - - BLASLONG tempR1 ; - __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" - "pfd 2, 0(%[y_tmp]) \n\t" +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( #if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} + "vlrepg %%v0,0(%3) \n\t" + "vleg %%v1,8(%3),0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,8(%3),1 \n\t" +#else + "vleg %%v0,0(%3),1 \n\t" + "vflcdb %%v0,%%v0 \n\t" + "vleg %%v0,0(%3),0 \n\t" + "vlrepg %%v1,8(%3) \n\t" +#endif + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" -#else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vl %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 32(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 48(%[t1],%[y_tmp]) \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vl %%v30, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v31, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 96(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 1, 256(%[t1],%[x_tmp]) \n\t" - "pfd 2, 256(%[t1],%[y_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - "vfmadb %%v30, %%v20, %%v28, %%v30 \n\t" - "vfmadb %%v31, %%v21, %%v28, %%v31 \n\t" - "vfmadb %%v6, %%v22, %%v28, %%v6 \n\t" - "vfmadb %%v7, %%v23, %%v28, %%v7 \n\t" - "vl %%v16, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v17, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v18, 96(%[t1],%[y_tmp]) \n\t" - "vl %%v19, 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" - "vst %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v6 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,80(%%r1,%1) \n\t" + "vl %%v18,96(%%r1,%1) \n\t" + "vl %%v19,112(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "vst %%v28,64(%%r1,%2) \n\t" + "vst %%v29,80(%%r1,%2) \n\t" + "vst %%v30,96(%%r1,%2) \n\t" + "vst %%v31,112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; + FLOAT da[2]; if (n <= 0) return (0); @@ -166,8 +122,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; - if (n1) { - zaxpy_kernel_8(n1, x, y, da_r,da_i); + if (n1) { + da[0] = da_r; + da[1] = da_i; + zaxpy_kernel_8(n1, x, y, da); ix = 2 * n1; } i = n1; diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index b5bf383f7..8c940bba3 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -24,71 +24,28 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - + #include "common.h" - -static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n) - : [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,4 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","r2" + ); } - int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; @@ -137,9 +94,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } } - return(0); - + return(0); } - - diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 61c5d6b98..aab18e2e9 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -23,137 +23,92 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" -#if defined(Z13) - -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ __asm__ volatile( - "pfd 1, 0(%[ptr_x_tmp]) \n\t" - "pfd 1, 0(%[ptr_y_tmp]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %[n_tmp],%[n_tmp],3 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" - "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "vzero %%v28 \n\t" + "vzero %%v29 \n\t" + "vzero %%v30 \n\t" + "vzero %%v31 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" + "vl %%v16, 64(%%r1,%1) \n\t" + "vl %%v17, 80(%%r1,%1) \n\t" + "vl %%v18, 96(%%r1,%1) \n\t" + "vl %%v19, 112(%%r1,%1) \n\t" + "vl %%v0, 64(%%r1,%2) \n\t" + "vl %%v1, 80(%%r1,%2) \n\t" + "vl %%v2, 96(%%r1,%2) \n\t" + "vl %%v3, 112(%%r1,%2) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" - "vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - - - "la %%r1,128(%%r1) \n\t" - "brctg %[n_tmp],1b \n\t" - "vfadb %%v24,%%v26,%%v24 \n\t" - "vfadb %%v25,%%v25,%%v27 \n\t" - "vsteg %%v24, 0(%[ptr_d]),0 \n\t" - "vsteg %%v24, 8(%[ptr_d]),1 \n\t" - "vsteg %%v25,16(%[ptr_d]),1 \n\t" - "vsteg %%v25,24(%[ptr_d]),0 \n\t" - : [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n) - : [mem_x] "m"( *(const double (*)[2*n])x), - [mem_y] "m"( *(const double (*)[2*n])y), - [ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d) - : "cc", "r1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v24,%%v24,%%v26 \n\t" + "vfadb %%v24,%%v24,%%v28 \n\t" + "vfadb %%v24,%%v24,%%v30 \n\t" + "vfadb %%v25,%%v25,%%v27 \n\t" + "vfadb %%v25,%%v25,%%v29 \n\t" + "vfadb %%v25,%%v25,%%v31 \n\t" + "vsteg %%v24,0(%3),0 \n\t" + "vsteg %%v24,8(%3),1 \n\t" + "vsteg %%v25,16(%3),1 \n\t" + "vsteg %%v25,24(%3),0 " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else - -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { - BLASLONG register i = 0; - FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0}; - BLASLONG j = 0; - - while (i < n) { - - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; - - dot[0] += x[j + 2] * y[j + 2]; - dot[1] += x[j + 3] * y[j + 3]; - dot[2] += x[j + 2] * y[j + 3]; - dot[3] += x[j + 3] * y[j + 2]; - - dot[0] += x[j + 4] * y[j + 4]; - dot[1] += x[j + 5] * y[j + 5]; - dot[2] += x[j + 4] * y[j + 5]; - dot[3] += x[j + 5] * y[j + 4]; - - dot[0] += x[j + 6] * y[j + 6]; - dot[1] += x[j + 7] * y[j + 7]; - dot[2] += x[j + 6] * y[j + 7]; - dot[3] += x[j + 7] * y[j + 6]; - - j += 8; - i += 4; - - } - d[0] = dot[0]; - d[1] = dot[1]; - d[2] = dot[2]; - d[3] = dot[3]; - -} - -#endif - OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix=0, iy=0; + BLASLONG i; + BLASLONG ix, iy; OPENBLAS_COMPLEX_FLOAT result; FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; @@ -167,14 +122,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { BLASLONG n1 = n & -8; - BLASLONG j=0; - if (n1){ + if (n1) zdot_kernel_8(n1, x, y, dot); - i = n1; - j = n1 <<1; - } - + + i = n1; + BLASLONG j = i * 2; while (i < n) { diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 380f0140e..75027a06c 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27,112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19,112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "clgrjl %%r1,%[tmp],1b \n\t" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc","r1" ,"v0","v1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ ( + "vlrepg %%v0,%3 \n\t" + "vlrepg %%v1,%4 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -214,8 +204,11 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -16; if ( n1 > 0 ) - { - zrot_kernel_16(n1, x, y, c, s); + { + FLOAT cosa,sina; + cosa=c; + sina=s; + zrot_kernel_16(n1, x, y, &cosa, &sina); i=n1; ix=2*n1; } @@ -234,6 +227,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + } else { @@ -259,3 +253,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 4764c0a52..4d8ee960f 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -23,270 +23,211 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ #include "common.h" +static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepg %%v0,0(%1) \n\t" + "vleg %%v1,8(%1),0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,8(%1),1 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + "vpdi %%v28,%%v20,%%v20,4 \n\t" + "vpdi %%v29,%%v21,%%v21,4 \n\t" + "vpdi %%v30,%%v22,%%v22,4 \n\t" + "vpdi %%v31,%%v23,%%v23,4 \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17 \n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18 \n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19 \n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20 \n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21 \n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22 \n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} +static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vleg %%v0,8(%1),0 \n\t" + "wflcdb %%v0,%%v0 \n\t" + "vleg %%v0,8(%1),1 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" -static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { - BLASLONG tempR1 ; - __asm__ ( - "pfd 2, 0(%[x_tmp]) \n\t" -#if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v16,%%v16,%%v16,4 \n\t" + "vpdi %%v17,%%v17,%%v17,4 \n\t" + "vpdi %%v18,%%v18,%%v18,4 \n\t" + "vpdi %%v19,%%v19,%%v19,4 \n\t" + "vpdi %%v20,%%v20,%%v20,4 \n\t" + "vpdi %%v21,%%v21,%%v21,4 \n\t" + "vpdi %%v22,%%v22,%%v22,4 \n\t" + "vpdi %%v23,%%v23,%%v23,4 \n\t" -#else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmdb %%v16, %%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vl %%v20, 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21, 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22, 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23, 112(%[t1],%[x_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" - "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepg %%v0,0(%1) \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 2, 256(%[t1],%[x_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - - "vfmdb %%v30, %%v20, %%v28 \n\t" - "vfmdb %%v31, %%v21, %%v28 \n\t" - "vfmdb %%v6, %%v22, %%v28 \n\t" - "vfmdb %%v7, %%v23, %%v28 \n\t" - - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" - - - "vst %%v30 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v6 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[x_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - - - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmdb %%v16, %%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" - - : [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - - -} - -static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%1) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint - "vflcdb %%v16,%%v16 \n\t" //complement both - "vlvgg %%v16,%%r0,0 \n\t" //restore 1st - "vlr %%v17 ,%%v16 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v16 \n\t" - "vsteg %%v24, 0(%[x_ptr]),1 \n\t" - "vsteg %%v24, 8(%[x_ptr]),0 \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v17 \n\t" - "vsteg %%v25, 16(%[x_ptr]),1 \n\t" - "vsteg %%v25, 24(%[x_ptr]),0 \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vsteg %%v26, 32(%[x_ptr]),1 \n\t" - "vsteg %%v26, 40(%[x_ptr]),0 \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vsteg %%v27, 48(%[x_ptr]),1 \n\t" - "vsteg %%v27, 56(%[x_ptr]),0 \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v16 \n\t" - "vsteg %%v28, 64(%[x_ptr]),1 \n\t" - "vsteg %%v28, 72(%[x_ptr]),0 \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v17 \n\t" - "vsteg %%v29, 80(%[x_ptr]),1 \n\t" - "vsteg %%v29, 88(%[x_ptr]),0 \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb %%v30,%%v30,%%v16 \n\t" - "vsteg %%v30, 96(%[x_ptr]),1 \n\t" - "vsteg %%v30, 104(%[x_ptr]),0 \n\t" - "vl %%v31, 112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vsteg %%v31, 112(%[x_ptr]),1 \n\t" - "vsteg %%v31, 120(%[x_ptr]),0 \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_i) - :"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - - + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); } -static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) { - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v18,%%r0,%%r0 \n\t" - "vlr %%v19,%%v18 \n\t" - "vlr %%v16,%%v18 \n\t" - "vlr %%v17,%%v18 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v18 \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v19 \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vst %%v26, 32(%[x_ptr]) \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vst %%v27, 48(%[x_ptr]) \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v18 \n\t" - "vst %%v28, 64(%[x_ptr]) \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v19 \n\t" - "vst %%v29, 80(%[x_ptr]) \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb %%v30,%%v30,%%v16 \n\t" - "vst %%v30, 96(%[x_ptr]) \n\t" - "vl %%v31,112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vst %%v31,112(%[x_ptr]) \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_r) - : "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" - ); - -} - -static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256( %[x_ptr]) \n\t" - "vst %%v24, 0( %[x_ptr]) \n\t" - "vst %%v25, 16( %[x_ptr]) \n\t" - "vst %%v26, 32( %[x_ptr]) \n\t" - "vst %%v27, 48( %[x_ptr]) \n\t" - "vst %%v24, 64( %[x_ptr]) \n\t" - "vst %%v25, 80( %[x_ptr]) \n\t" - "vst %%v26, 96( %[x_ptr]) \n\t" - "vst %%v27,112( %[x_ptr]) \n\t" - - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" ,"r0","v24","v25","v26","v27" - ); - -} - - - - - -static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) { - +static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +{ BLASLONG i; BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; - for (i = 0; i < n; i += 4) { + for (i = 0; i < n; i += 4) + { t0 = da_r * x[0] - da_i * x[1]; t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; @@ -303,17 +244,14 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLAS x[inc_x3] = t3; x += 4 * inc_x; - } - - } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; FLOAT temp0; FLOAT temp1; - + FLOAT alpha[2] __attribute__ ((aligned(16))); if (inc_x != 1) { inc_x <<= 1; @@ -405,8 +343,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { BLASLONG n1 = n & -8; - if (n1 > 0) { - zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x); + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); j = n1; i = n1 * inc_x; } @@ -432,17 +372,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; if (da_r == 0.0) if (da_i == 0) zscal_kernel_8_zero(n1, x); else - zscal_kernel_8_zero_r(n1, da_i, x); + zscal_kernel_8_zero_r(n1, alpha, x); else if (da_i == 0) - zscal_kernel_8_zero_i(n1, da_r, x); + zscal_kernel_8_zero_i(n1, alpha, x); else - zscal_kernel_8(n1, da_r,da_i, x); + zscal_kernel_8(n1, alpha, x); i = n1 << 1; j = n1; @@ -508,5 +450,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, return (0); } - - diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 062079002..a16b87cdc 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -25,220 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" - -#if defined(Z13_SWAP_A) -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" + __asm__ volatile( + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else - -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} - -#endif - - - - - - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; diff --git a/ztest/Makefile b/ztest/Makefile new file mode 100644 index 000000000..0ff7fe46a --- /dev/null +++ b/ztest/Makefile @@ -0,0 +1,437 @@ +TOPDIR = .. +include $(TOPDIR)/Makefile.system + +goto :: sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto sswap.goto dswap.goto cswap.goto zswap.goto isamax.goto idamax.goto icamax.goto izamax.goto samax.goto damax.goto ismax.goto idmax.goto smax.goto dmax.goto isamin.goto idamin.goto icamin.goto izamin.goto samin.goto damin.goto camin.goto zamin.goto ismin.goto idmin.goto smin.goto dmin.goto sgemv.goto dgemv.goto cgemv.goto zgemv.goto sscal.goto dscal.goto cscal.goto zscal.goto saxpy.goto daxpy.goto caxpy.goto zaxpy.goto srot.goto drot.goto crot.goto zrot.goto sasum.goto dasum.goto casum.goto zasum.goto scopy.goto dcopy.goto ccopy.goto zcopy.goto + +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Ddot #################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zdot #################################################### +zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dsdot #################################################### +dsdot.goto : dsdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMAX ############################################## +samax.goto : samax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMAX ############################################## +damax.goto : damax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMAX ############################################## +ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMAX ############################################## +idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMAX ############################################## +smax.goto : smax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMAX ############################################## +dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMIN ############################################## +isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMIN ############################################## +idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMIN ############################################## +icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMIN ############################################## +izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMIN ############################################## +samin.goto : samin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMIN ############################################## +damin.goto : damin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMIN ############################################## +camin.goto : camin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMIN ############################################## +zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMIN ############################################## +ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMIN ############################################## +idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMIN ############################################## +smin.goto : smin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMIN ############################################## +dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cgemv #################################################### + +cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Crot #################################################### +crot.goto : crot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zrot #################################################### +zrot.goto : zrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zswap #################################################### + +zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sasum #################################################### +sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dasum #################################################### +dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Casum #################################################### + +casum.goto : casum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zasum #################################################### + +zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Scopy #################################################### +scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dcopy #################################################### +dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Ccopy #################################################### + +ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zcopy #################################################### + +zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +################################################################################################### + +sdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +ddot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +dsdot.$(SUFFIX) : dsdot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +isamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +samax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ismax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +smax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +isamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +samin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ismin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +smin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +crot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +clean :: + @rm -f *.goto + diff --git a/ztest/amax.c b/ztest/amax.c new file mode 100644 index 000000000..f2e3f5411 --- /dev/null +++ b/ztest/amax.c @@ -0,0 +1,235 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + +#undef AMAX +#ifdef DOUBLE +#define AMAX BLASFUNC(damax) +#else +#define AMAX BLASFUNC(samax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + +#undef AMIN +#ifdef DOUBLE +#define AMIN BLASFUNC(damin) +#else +#define AMIN BLASFUNC(samin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} +#else +FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} +#endif + +#undef ASUM +#ifdef COMPLEX +#ifdef DOUBLE +#define ASUM BLASFUNC(dzasum) +#else +#define ASUM BLASFUNC(scasum) +#endif +#else +#ifdef DOUBLE +#define ASUM BLASFUNC(dasum) +#else +#define ASUM BLASFUNC(sasum) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} +#else +int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} +#endif + +#undef AXPY +#ifdef COMPLEX +#ifdef DOUBLE +#define AXPY BLASFUNC(zaxpy) +#else +#define AXPY BLASFUNC(caxpy) +#endif +#else +#ifdef DOUBLE +#define AXPY BLASFUNC(daxpy) +#else +#define AXPY BLASFUNC(saxpy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *y_c;; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + argc--;argv++; + + blasint iy; + int test = 1; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} +#else +int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} +#endif + +#undef COPY +#ifdef COMPLEX +#ifdef DOUBLE +#define COPY BLASFUNC(zcopy) +#else +#define COPY BLASFUNC(ccopy) +#endif +#else +#ifdef DOUBLE +#define COPY BLASFUNC(dcopy) +#else +#define COPY BLASFUNC(scopy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *y_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot[2]; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; + + dot[0]=0.0; + dot[1]=0.0; + + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { +#if !defined(CONJ) + dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; + dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; +#else + dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; + dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; + return(result); + +} +#else +FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); +} +#endif + +#undef DOT +#ifdef COMPLEX +#ifdef DOUBLE +#define DOT BLASFUNC(zdotu) +#else +#define DOT BLASFUNC(cdotu) +#endif +#else +#ifdef DOUBLE +#define DOT BLASFUNC(ddot) +#else +#define DOT BLASFUNC(sdot) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; +#ifdef COMPLEX + OPENBLAS_COMPLEX_FLOAT result, result_c; +#else + FLOAT result, result_c; +#endif + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); +} + +#undef DSDOT +#define DSDOT BLASFUNC(dsdot) + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + double result, result_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + ix = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; jtv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y, *y_c; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char trans='N'; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + blasint n=0; + int has_param_n = 0; + int has_param_m = 0; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + + int tomax = to; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_PARAM_N"))) { + n = atoi(p); + if ((n>0)) has_param_n = 1; + if ( n > tomax ) tomax = n; + } + if ( has_param_n == 0 ) + if ((p = getenv("OPENBLAS_PARAM_M"))) { + m = atoi(p); + if ((m>0)) has_param_m = 1; + if ( m > tomax ) tomax = m; + } + + + + fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + if (has_param_m == 0) + { + + for(m = from; m <= to; m += step) + { + timeg=0; + timeg_c=0; + if ( has_param_n == 0 ) n = m; + fprintf(stderr, " %6dx%d :", (int)m,(int)n); + for(j = 0; j < m; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} +#else +BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} +#endif + +#undef IAMAX +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMAX BLASFUNC(izamax) +#else +#define IAMAX BLASFUNC(icamax) +#endif +#else +#ifdef DOUBLE +#define IAMAX BLASFUNC(idamax) +#else +#define IAMAX BLASFUNC(isamax) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} +#else +BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} +#endif + +#undef IAMIN +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMIN BLASFUNC(izamin) +#else +#define IAMIN BLASFUNC(icamin) +#endif +#else +#ifdef DOUBLE +#define IAMIN BLASFUNC(idamin) +#else +#define IAMIN BLASFUNC(isamin) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + +#undef IMAX +#ifdef DOUBLE +#define IMAX BLASFUNC(idmax) +#else +#define IMAX BLASFUNC(ismax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + +#undef IMIN +#ifdef DOUBLE +#define IMIN BLASFUNC(idmin) +#else +#define IMIN BLASFUNC(ismin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + +#undef MAX_ +#ifdef DOUBLE +#define MAX_ BLASFUNC(dmax) +#else +#define MAX_ BLASFUNC(smax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + +#undef MIN_ +#ifdef DOUBLE +#define MIN_ BLASFUNC(dmin) +#else +#define MIN_ BLASFUNC(smin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); +} +#else +int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); +} +#endif + +#undef ROT +#ifdef COMPLEX +#ifdef DOUBLE +#define ROT BLASFUNC(zdrot) +#else +#define ROT BLASFUNC(csrot) +#endif +#else +#ifdef DOUBLE +#define ROT BLASFUNC(drot) +#else +#define ROT BLASFUNC(srot) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *x_c, *y_c; + // FLOAT result; + blasint m, i; + blasint inc_x=1,inc_y=1; + FLOAT c[1] = { 2.0 }; + FLOAT s[1] = { 2.0 }; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix,iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; + + if ( (n <= 0) || (inc_x <= 0)) + return(0); + + inc_x2 = 2 * inc_x; + for ( i=0; itv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *x_c; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); +} +#else +int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); +} +#endif + +#undef SWAP +#ifdef COMPLEX +#ifdef DOUBLE +#define SWAP BLASFUNC(zswap) +#else +#define SWAP BLASFUNC(cswap) +#endif +#else +#ifdef DOUBLE +#define SWAP BLASFUNC(dswap) +#else +#define SWAP BLASFUNC(sswap) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *x_c, *y_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix,iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l Date: Mon, 6 Aug 2018 20:03:49 +0300 Subject: [PATCH 02/26] [ZARCH] Restore detect() function --- cpuid_zarch.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 0ae32f27d..073419fa8 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -45,9 +45,29 @@ static char *cpuname_lower[] = { int detect(void) { - // return CPU_GENERIC; - return CPU_Z14; - + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + infile = fopen("/proc/sysinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("Type", buffer, 4)){ + p = strchr(buffer, ':') + 2; +#if 0 + fprintf(stderr, "%s\n", p); +#endif + break; + } + } + + fclose(infile); + + if (strstr(p, "2964")) return CPU_Z13; + if (strstr(p, "2965")) return CPU_Z13; + if (strstr(p, "3906")) return CPU_Z14; + if (strstr(p, "3907")) return CPU_Z14; + + return CPU_GENERIC; } void get_libname(void) From e6c0e39492d49eded5a72c9882b79bed7bff35d0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 13 Aug 2018 12:23:40 +0300 Subject: [PATCH 03/26] Optimize Zgemv --- cpuid_zarch.c | 8 +- kernel/zarch/KERNEL.Z13 | 4 +- kernel/zarch/KERNEL.Z14 | 8 +- kernel/zarch/camax.c | 46 +- kernel/zarch/camin.c | 46 +- kernel/zarch/caxpy.c | 4 +- kernel/zarch/cgemv_n_4.c | 743 ++++++++++++++++++++ kernel/zarch/cgemv_t_4.c | 671 ++++++++++++++++++ kernel/zarch/icamax.c | 9 +- kernel/zarch/icamin.c | 9 +- kernel/zarch/idamax.c | 11 +- kernel/zarch/idamin.c | 11 +- kernel/zarch/idmax.c | 11 +- kernel/zarch/idmin.c | 11 +- kernel/zarch/isamax.c | 11 +- kernel/zarch/isamin.c | 11 +- kernel/zarch/ismax.c | 11 +- kernel/zarch/ismin.c | 11 +- kernel/zarch/izamax.c | 9 +- kernel/zarch/izamin.c | 9 +- kernel/zarch/zamax.c | 48 +- kernel/zarch/zamin.c | 46 +- kernel/zarch/zaxpy.c | 4 +- kernel/zarch/zgemv_n_4.c | 1423 ++++++++++++++++---------------------- kernel/zarch/zgemv_t_4.c | 1251 +++++++++++++++------------------ ztest/gemv.c | 159 +++-- 26 files changed, 2869 insertions(+), 1716 deletions(-) create mode 100644 kernel/zarch/cgemv_n_4.c create mode 100644 kernel/zarch/cgemv_t_4.c diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 073419fa8..8ed40099b 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -27,9 +27,9 @@ #include -#define CPU_GENERIC 0 -#define CPU_Z13 1 -#define CPU_Z14 2 +#define CPU_GENERIC 0 +#define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", @@ -112,7 +112,7 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; - case CPU_Z14: + case CPU_Z14: printf("#define Z14\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index d39b9d904..e5b974ab4 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n_4.c CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index fa88b6881..80f78f48f 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -73,13 +73,13 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = sgemv_n_4.c DGEMVNKERNEL = dgemv_n_4.c -CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = ../arm/zgemv_n.c +CGEMVNKERNEL = cgemv_n_4.c +ZGEMVNKERNEL = zgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVTKERNEL = dgemv_t_4.c -CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = ../arm/zgemv_t.c +CGEMVTKERNEL = cgemv_t_4.c +ZGEMVTKERNEL = zgemv_t_4.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 6394be769..3506c4e9b 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -198,7 +198,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG inc_x2; @@ -216,53 +216,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { maxf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) > maxf) { - maxf = ABS(x[i*2]); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } + ix += 2; i++; } return (maxf); } else { - inc_x2 = 2 * inc_x; maxf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) > maxf) { - maxf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) > maxf) { - maxf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) > maxf) { - maxf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (maxf); } diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 936c300c8..726747b99 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -198,7 +198,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT minf = 0.0; BLASLONG inc_x2; @@ -216,53 +216,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { minf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) < minf) { - minf = ABS(x[i*2]); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } + ix += 2; i++; } return (minf); } else { - inc_x2 = 2 * inc_x; minf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) < minf) { - minf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) < minf) { - minf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) < minf) { - minf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (minf); } diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index 2176f3dcd..fe5568cc8 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -110,7 +110,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "agfi %%r1,128 \n\t" "brctg %%r0,0b " : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); } @@ -118,7 +118,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; - FLOAT da[2]; + FLOAT da[2] __attribute__ ((aligned(16))); if (n <= 0) return (0); diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c new file mode 100644 index 000000000..4c3253774 --- /dev/null +++ b/kernel/zarch/cgemv_n_4.c @@ -0,0 +1,743 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#define NBMAX 1024 + +static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%5) \n\t" + "vlrepg %%v17,8(%5) \n\t" + "vlrepg %%v18,16(%5) \n\t" + "vlrepg %%v19,24(%5) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" +#else + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vlef %%v24,0(%%r1,%1),0 \n\t" + "vlef %%v24,0(%%r1,%1),1 \n\t" + "vlef %%v24,8(%%r1,%1),2 \n\t" + "vlef %%v24,8(%%r1,%1),3 \n\t" + "vlef %%v25,4(%%r1,%1),0 \n\t" + "vlef %%v25,4(%%r1,%1),1 \n\t" + "vlef %%v25,12(%%r1,%1),2 \n\t" + "vlef %%v25,12(%%r1,%1),3 \n\t" + "vlef %%v26,0(%%r1,%2),0 \n\t" + "vlef %%v26,0(%%r1,%2),1 \n\t" + "vlef %%v26,8(%%r1,%2),2 \n\t" + "vlef %%v26,8(%%r1,%2),3 \n\t" + "vlef %%v27,4(%%r1,%2),0 \n\t" + "vlef %%v27,4(%%r1,%2),1 \n\t" + "vlef %%v27,12(%%r1,%2),2 \n\t" + "vlef %%v27,12(%%r1,%2),3 \n\t" + + "vl %%v0,0(%%r1,%6) \n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlef %%v28,0(%%r1,%1),0 \n\t" + "vlef %%v28,0(%%r1,%1),1 \n\t" + "vlef %%v28,8(%%r1,%1),2 \n\t" + "vlef %%v28,8(%%r1,%1),3 \n\t" + "vlef %%v29,4(%%r1,%1),0 \n\t" + "vlef %%v29,4(%%r1,%1),1 \n\t" + "vlef %%v29,12(%%r1,%1),2 \n\t" + "vlef %%v29,12(%%r1,%1),3 \n\t" + "vlef %%v30,0(%%r1,%2),0 \n\t" + "vlef %%v30,0(%%r1,%2),1 \n\t" + "vlef %%v30,8(%%r1,%2),2 \n\t" + "vlef %%v30,8(%%r1,%2),3 \n\t" + "vlef %%v31,4(%%r1,%2),0 \n\t" + "vlef %%v31,4(%%r1,%2),1 \n\t" + "vlef %%v31,12(%%r1,%2),2 \n\t" + "vlef %%v31,12(%%r1,%2),3 \n\t" + + "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmasb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmasb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,0(%%r1,%6) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%3) \n\t" + "vlrepg %%v17,8(%3) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" +#else + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vlef %%v20,0(%%r1,%1),0 \n\t" + "vlef %%v20,0(%%r1,%1),1 \n\t" + "vlef %%v20,8(%%r1,%1),2 \n\t" + "vlef %%v20,8(%%r1,%1),3 \n\t" + "vlef %%v21,4(%%r1,%1),0 \n\t" + "vlef %%v21,4(%%r1,%1),1 \n\t" + "vlef %%v21,12(%%r1,%1),2 \n\t" + "vlef %%v21,12(%%r1,%1),3 \n\t" + "vlef %%v22,0(%%r1,%2),0 \n\t" + "vlef %%v22,0(%%r1,%2),1 \n\t" + "vlef %%v22,8(%%r1,%2),2 \n\t" + "vlef %%v22,8(%%r1,%2),3 \n\t" + "vlef %%v23,4(%%r1,%2),0 \n\t" + "vlef %%v23,4(%%r1,%2),1 \n\t" + "vlef %%v23,12(%%r1,%2),2 \n\t" + "vlef %%v23,12(%%r1,%2),3 \n\t" + + "vl %%v0,0(%%r1,%4) \n\t" + "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmasb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,0(%%r1,%4) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%2) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v17,4(%2),0 \n\t" + "vlef %%v17,4(%2),2 \n\t" + "vflcsb %%v17,%%v17 \n\t" + "vlef %%v17,0(%2),1 \n\t" + "vlef %%v17,0(%2),3 \n\t" +#else + "vlef %%v17,0(%2),1 \n\t" + "vlef %%v17,0(%2),3 \n\t" + "vflcsb %%v17,%%v17 \n\t" + "vlef %%v17,4(%2),0 \n\t" + "vlef %%v17,4(%2),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vlef %%v18,0(%%r1,%1),0 \n\t" + "vlef %%v18,0(%%r1,%1),1 \n\t" + "vlef %%v18,8(%%r1,%1),2 \n\t" + "vlef %%v18,8(%%r1,%1),3 \n\t" + "vlef %%v19,4(%%r1,%1),0 \n\t" + "vlef %%v19,4(%%r1,%1),1 \n\t" + "vlef %%v19,12(%%r1,%1),2 \n\t" + "vlef %%v19,12(%%r1,%1),3 \n\t" + + "vl %%v0,0(%%r1,%3) \n\t" + "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19" + ); +} + +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) +{ + __asm__ volatile ( +#if !defined(XCONJ) + "vlrepf %%v0,%3 \n\t" + "vlef %%v1,%4,0 \n\t" + "vlef %%v1,%4,2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,%4,1 \n\t" + "vlef %%v1,%4,3 \n\t" +#else + "vlef %%v0,%3,1 \n\t" + "vlef %%v0,%3,3 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,%3,0 \n\t" + "vlef %%v0,%3,2 \n\t" + "vlrepf %%v1,%4 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,2 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,0(%%r1,%2) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + + "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" + "vfmasb %%v23,%%v17,%%v0,%%v19 \n\t" + + "vfmasb %%v22,%%v20,%%v1,%%v22 \n\t" + "vfmasb %%v23,%%v21,%%v1,%%v23 \n\t" + + "vst %%v22,0(%%r1,%2) \n\t" + "vst %%v23,16(%%r1,%2) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); + + x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; + + if ( m3 == 3 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + + if ( m3 == 2 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return(0); + } + + + if ( m3 == 1 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + return(0); +} diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index e7f096e0d..9b4077c6b 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -281,6 +281,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + maxf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -296,9 +302,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - maxf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index b9c1ccd9c..6e952a325 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -281,6 +281,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + minf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -296,9 +302,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - minf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index aba880949..d1f135369 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -204,6 +204,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) > maxf) { @@ -216,7 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) > maxf) { diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 3213efa4d..679606a8f 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -204,6 +204,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) < minf) { @@ -216,7 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) < minf) { diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 26fff4eb0..5de41ac7b 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -180,6 +180,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = x[0]; + i++; + } while (i < n) { if (x[i] > maxf) { @@ -192,7 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] > maxf) { diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 570b33a15..7fec111cf 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -180,6 +180,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = x[0]; + i++; + } while (i < n) { if (x[i] < minf) { @@ -192,7 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] < minf) { diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 95a665b10..d2686c0cd 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -247,6 +247,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) > maxf) { @@ -259,7 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) > maxf) { diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 640fc02c9..768f31a8c 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -247,6 +247,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) < minf) { @@ -259,7 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) < minf) { diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 0eb350315..8fc32adf6 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -223,6 +223,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = x[0]; + i++; + } while (i < n) { if (x[i] > maxf) { @@ -235,7 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] > maxf) { diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index f050db8cb..415052810 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -223,6 +223,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = x[0]; + i++; + } while (i < n) { if (x[i] < minf) { @@ -235,7 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] < minf) { diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index bf5f621a7..541464b05 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -202,6 +202,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + maxf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -217,9 +223,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - maxf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 3636e8fdf..4b5572b80 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -202,6 +202,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + minf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -217,9 +223,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - minf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 6393b099b..937bc9753 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -150,7 +150,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG inc_x2; @@ -168,53 +168,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { maxf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) > maxf) { - maxf = ABS(x[i*2]); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } + ix += 2; i++; } return (maxf); } else { - - inc_x2 = 2 * inc_x; + maxf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) > maxf) { - maxf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) > maxf) { - maxf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) > maxf) { - maxf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (maxf); } diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index b15774bb9..8564edaf4 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -150,7 +150,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT minf = 0.0; BLASLONG inc_x2; @@ -168,53 +168,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { minf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) < minf) { - minf = ABS(x[i*2]); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } + ix += 2; i++; } return (minf); } else { - inc_x2 = 2 * inc_x; minf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) < minf) { - minf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) < minf) { - minf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) < minf) { - minf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (minf); } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 6ba44a27c..f0e993d2f 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -106,7 +106,7 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "agfi %%r1,128 \n\t" "brctg %%r0,0b " : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); } @@ -114,7 +114,7 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; - FLOAT da[2]; + FLOAT da[2] __attribute__ ((aligned(16))); if (n <= 0) return (0); diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 484db3073..9472b5d5a 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2018, The OpenBLAS Project +Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,898 +23,693 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ #include #include #include "common.h" -#define HAVE_KERNEL_4x4_VEC 1 -#define HAVE_KERNEL_4x2_VEC 1 -#define HAVE_KERNEL_4x1_VEC 1 -#define HAVE_KERNEL_ADDY 1 - -#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) -#include -#endif - -// #define NBMAX 1024 -#ifdef HAVE_KERNEL_4x4_VEC_ASM - -#elif HAVE_KERNEL_4x4_VEC - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%5) \n\t" + "vl %%v17,16(%5) \n\t" + "vl %%v18,32(%5) \n\t" + "vl %%v19,48(%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - register __vector double vx1_r = {x[2], x[2]}; - register __vector double vx1_i = {-x[3], x[3]}; - register __vector double vx2_r = {x[4], x[4]}; - register __vector double vx2_i = {-x[5], x[5]}; - register __vector double vx3_r = {x[6], x[6]}; - register __vector double vx3_i = {-x[7], x[7]}; - + "vleg %%v20,8(%5),0 \n\t" + "wflcdb %%v20,%%v20 \n\t" + "vleg %%v20,0(%5),1 \n\t" + "vleg %%v21,24(%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,16(%5),1 \n\t" + "vleg %%v22,40(%5),0 \n\t" + "wflcdb %%v22,%%v22 \n\t" + "vleg %%v22,32(%5),1 \n\t" + "vleg %%v23,56(%5),0 \n\t" + "wflcdb %%v23,%%v23 \n\t" + "vleg %%v23,48(%5),1 \n\t" #else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; - register __vector double vx1_r = {x[2], -x[2]}; - register __vector double vx1_i = {x[3], x[3]}; - register __vector double vx2_r = {x[4], -x[4]}; - register __vector double vx2_i = {x[5], x[5]}; - register __vector double vx3_r = {x[6], -x[6]}; - register __vector double vx3_i = {x[7], x[7]}; + "vleg %%v20,0(%5),1 \n\t" + "vflcdb %%v20,%%v20 \n\t" + "vleg %%v20,8(%5),0 \n\t" + "vleg %%v21,16(%5),1 \n\t" + "vflcdb %%v21,%%v21 \n\t" + "vleg %%v21,24(%5),0 \n\t" + "vleg %%v22,32(%5),1 \n\t" + "vflcdb %%v22,%%v22 \n\t" + "vleg %%v22,40(%5),0 \n\t" + "vleg %%v23,48(%5),1 \n\t" + "vflcdb %%v23,%%v23 \n\t" + "vleg %%v23,56(%5),0 \n\t" #endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - register __vector double *vptr_a1 = (__vector double *) a1; - register __vector double *vptr_a2 = (__vector double *) a2; - register __vector double *vptr_a3 = (__vector double *) a3; + "vlrepg %%v24,0(%%r1,%1) \n\t" + "vlrepg %%v25,8(%%r1,%1) \n\t" + "vlrepg %%v26,0(%%r1,%2) \n\t" + "vlrepg %%v27,8(%%r1,%2) \n\t" + + "vl %%v0,0(%%r1,%6) \n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" - for (i = 0; i < n; i += 4) { + "vlrepg %%v28,0(%%r1,%3) \n\t" + "vlrepg %%v29,8(%%r1,%3) \n\t" + "vlrepg %%v30,0(%%r1,%4) \n\t" + "vlrepg %%v31,8(%%r1,%4) \n\t" + + "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,0(%%r1,%6) \n\t" - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; + "vlrepg %%v24,16(%%r1,%1) \n\t" + "vlrepg %%v25,24(%%r1,%1) \n\t" + "vlrepg %%v26,16(%%r1,%2) \n\t" + "vlrepg %%v27,24(%%r1,%2) \n\t" + + "vl %%v0,16(%%r1,%6) \n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = vptr_a0[i + 3]; - - register __vector double va1 = vptr_a1[i]; - register __vector double va1_1 = vptr_a1[i + 1]; - register __vector double va1_2 = vptr_a1[i + 2]; - register __vector double va1_3 = vptr_a1[i + 3]; - - register __vector double va2 = vptr_a2[i]; - register __vector double va2_1 = vptr_a2[i + 1]; - register __vector double va2_2 = vptr_a2[i + 2]; - register __vector double va2_3 = vptr_a2[i + 3]; - - register __vector double va3 = vptr_a3[i]; - register __vector double va3_1 = vptr_a3[i + 1]; - register __vector double va3_2 = vptr_a3[i + 2]; - register __vector double va3_3 = vptr_a3[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - vy_0 += va1*vx1_r; - vy_1 += va1_1*vx1_r; - vy_2 += va1_2*vx1_r; - vy_3 += va1_3*vx1_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va2*vx2_r; - vy_1 += va2_1*vx2_r; - vy_2 += va2_2*vx2_r; - vy_3 += va2_3*vx2_r; - - va1 = vec_permi(va1, va1, 2); - va1_1 = vec_permi(va1_1, va1_1, 2); - va1_2 = vec_permi(va1_2, va1_2, 2); - va1_3 = vec_permi(va1_3, va1_3, 2); - - vy_0 += va3*vx3_r; - vy_1 += va3_1*vx3_r; - vy_2 += va3_2*vx3_r; - vy_3 += va3_3*vx3_r; - - va2 = vec_permi(va2, va2, 2); - va2_1 = vec_permi(va2_1, va2_1, 2); - va2_2 = vec_permi(va2_2, va2_2, 2); - va2_3 = vec_permi(va2_3, va2_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - va3 = vec_permi(va3, va3, 2); - va3_1 = vec_permi(va3_1, va3_1, 2); - va3_2 = vec_permi(va3_2, va3_2, 2); - va3_3 = vec_permi(va3_3, va3_3, 2); - - vy_0 += va1*vx1_i; - vy_1 += va1_1*vx1_i; - vy_2 += va1_2*vx1_i; - vy_3 += va1_3*vx1_i; - - vy_0 += va2*vx2_i; - vy_1 += va2_1*vx2_i; - vy_2 += va2_2*vx2_i; - vy_3 += va2_3*vx2_i; - - vy_0 += va3*vx3_i; - vy_1 += va3_1*vx3_i; - vy_2 += va3_2*vx3_i; - vy_3 += va3_3*vx3_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } + "vlrepg %%v28,16(%%r1,%3) \n\t" + "vlrepg %%v29,24(%%r1,%3) \n\t" + "vlrepg %%v30,16(%%r1,%4) \n\t" + "vlrepg %%v31,24(%%r1,%4) \n\t" + + "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,16(%%r1,%6) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - - for (i = 0; i < 2 * n; i += 2) { +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%3) \n\t" + "vl %%v17,16(%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; - y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; - y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; - y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; - y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; - y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; - y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; - y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; - y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; + "vleg %%v18,8(%3),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%3),1 \n\t" + "vleg %%v19,24(%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,16(%3),1 \n\t" +#else + "vleg %%v18,0(%3),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,8(%3),0 \n\t" + "vleg %%v19,16(%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,24(%3),0 \n\t" #endif - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vlrepg %%v20,0(%%r1,%1) \n\t" + "vlrepg %%v21,8(%%r1,%1) \n\t" + "vlrepg %%v22,0(%%r1,%2) \n\t" + "vlrepg %%v23,8(%%r1,%2) \n\t" + + "vl %%v0,0(%%r1,%4) \n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,0(%%r1,%4) \n\t" + + "vlrepg %%v20,16(%%r1,%1) \n\t" + "vlrepg %%v21,24(%%r1,%1) \n\t" + "vlrepg %%v22,16(%%r1,%2) \n\t" + "vlrepg %%v23,24(%%r1,%2) \n\t" + + "vl %%v0,16(%%r1,%4) \n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,16(%%r1,%4) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -#endif - -#ifdef HAVE_KERNEL_4x2_VEC - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - - +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - register __vector double vx1_r = {x[2], x[2]}; - register __vector double vx1_i = {-x[3], x[3]}; - + "vleg %%v17,8(%2),0 \n\t" + "wflcdb %%v17,%%v17 \n\t" + "vleg %%v17,0(%2),1 \n\t" #else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; - register __vector double vx1_r = {x[2], -x[2]}; - register __vector double vx1_i = {x[3], x[3]}; + "vleg %%v17,0(%2),1 \n\t" + "vflcdb %%v17,%%v17 \n\t" + "vleg %%v17,8(%2),0 \n\t" #endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + "vlrepg %%v18,0(%%r1,%1) \n\t" + "vlrepg %%v19,8(%%r1,%1) \n\t" + + "vl %%v0,0(%%r1,%3) \n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,0(%%r1,%3) \n\t" - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - register __vector double *vptr_a1 = (__vector double *) a1; - - for (i = 0; i < n; i += 4) { - - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; - - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = vptr_a0[i + 3]; - - register __vector double va1 = vptr_a1[i]; - register __vector double va1_1 = vptr_a1[i + 1]; - register __vector double va1_2 = vptr_a1[i + 2]; - register __vector double va1_3 = vptr_a1[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va1*vx1_r; - vy_1 += va1_1*vx1_r; - vy_2 += va1_2*vx1_r; - vy_3 += va1_3*vx1_r; - - va1 = vec_permi(va1, va1, 2); - va1_1 = vec_permi(va1_1, va1_1, 2); - va1_2 = vec_permi(va1_2, va1_2, 2); - va1_3 = vec_permi(va1_3, va1_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - vy_0 += va1*vx1_i; - vy_1 += va1_1*vx1_i; - vy_2 += va1_2*vx1_i; - vy_3 += va1_3*vx1_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } -} -#else - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - - for (i = 0; i < 2 * n; i += 2) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; -#endif - } + "vlrepg %%v18,16(%%r1,%1) \n\t" + "vlrepg %%v19,24(%%r1,%1) \n\t" + + "vl %%v0,16(%%r1,%3) \n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,16(%%r1,%3) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19" + ); } -#endif - -#ifdef HAVE_KERNEL_4x1_VEC - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - -#else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; -#endif - - - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - - for (i = 0; i < n; i += 4) { - - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; - - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = vptr_a0[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } -} - -#else - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - - for (i = 0; i < 2 * n; i += 2) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; -#endif - - } -} - -#endif - -#ifdef HAVE_KERNEL_ADDY - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - - -#if !defined(XCONJ) - - register __vector double valpha_r = {alpha_r, alpha_r}; - register __vector double valpha_i = {-alpha_i, alpha_i}; - -#else - register __vector double valpha_r = {alpha_r, -alpha_r}; - register __vector double valpha_i = {alpha_i, alpha_i}; -#endif - - register __vector double *vptr_src = (__vector double *) src; - if (inc_dest != 2) { - register __vector double *vptr_y = (__vector double *) dest; - //note that inc_dest is already 2x. so we should add it to double* - register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest); - register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest); - register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest); - BLASLONG dest_t=0; - BLASLONG add_dest=inc_dest<<1; //inc_dest is already multiplied by 2, so for vector 4 we just multiply 2 times - for (i = 0; i < n; i += 4) { - - register __vector double vy_0=vptr_y[dest_t]; - register __vector double vy_1=vptr_y1[dest_t]; - register __vector double vy_2=vptr_y2[dest_t]; - register __vector double vy_3=vptr_y3[dest_t]; - - register __vector double vsrc = vptr_src[i]; - register __vector double vsrc_1 = vptr_src[i + 1]; - register __vector double vsrc_2 = vptr_src[i + 2]; - register __vector double vsrc_3 = vptr_src[i + 3]; - - vy_0 += vsrc*valpha_r; - vy_1 += vsrc_1*valpha_r; - vy_2 += vsrc_2*valpha_r; - vy_3 += vsrc_3*valpha_r; - - vsrc = vec_permi(vsrc, vsrc, 2); - vsrc_1 = vec_permi(vsrc_1, vsrc_1, 2); - vsrc_2 = vec_permi(vsrc_2, vsrc_2, 2); - vsrc_3 = vec_permi(vsrc_3, vsrc_3, 2); - - vy_0 += vsrc*valpha_i; - vy_1 += vsrc_1*valpha_i; - vy_2 += vsrc_2*valpha_i; - vy_3 += vsrc_3*valpha_i; - - vptr_y[dest_t] = vy_0; - vptr_y1[dest_t ] = vy_1; - vptr_y2[dest_t] = vy_2; - vptr_y3[dest_t] = vy_3; - - dest_t+=add_dest; - - } - - return; - } else { - register __vector double *vptr_y = (__vector double *) dest; - for (i = 0; i < n; i += 4) { - - register __vector double vy_0=vptr_y[i]; - register __vector double vy_1=vptr_y[i+1]; - register __vector double vy_2=vptr_y[i+2]; - register __vector double vy_3=vptr_y[i+3]; - - register __vector double vsrc = vptr_src[i]; - register __vector double vsrc_1 = vptr_src[i + 1]; - register __vector double vsrc_2 = vptr_src[i + 2]; - register __vector double vsrc_3 = vptr_src[i + 3]; - - vy_0 += vsrc*valpha_r; - vy_1 += vsrc_1*valpha_r; - vy_2 += vsrc_2*valpha_r; - vy_3 += vsrc_3*valpha_r; - - vsrc = vec_permi(vsrc, vsrc, 2); - vsrc_1 = vec_permi(vsrc_1, vsrc_1, 2); - vsrc_2 = vec_permi(vsrc_2, vsrc_2, 2); - vsrc_3 = vec_permi(vsrc_3, vsrc_3, 2); - - vy_0 += vsrc*valpha_i; - vy_1 += vsrc_1*valpha_i; - vy_2 += vsrc_2*valpha_i; - vy_3 += vsrc_3*valpha_i; - - vptr_y[i] = vy_0; - vptr_y[i + 1 ] = vy_1; - vptr_y[i + 2] = vy_2; - vptr_y[i + 3] = vy_3; - - } - - return; - } - return; -} - -#else - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - - if (inc_dest != 2) { - - FLOAT temp_r; - FLOAT temp_i; - for (i = 0; i < n; i++) { +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) +{ + __asm__ volatile ( #if !defined(XCONJ) - temp_r = alpha_r * src[0] - alpha_i * src[1]; - temp_i = alpha_r * src[1] + alpha_i * src[0]; + "vlrepg %%v0,%3 \n\t" + "vleg %%v1,%4,0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,%4,1 \n\t" #else - temp_r = alpha_r * src[0] + alpha_i * src[1]; - temp_i = -alpha_r * src[1] + alpha_i * src[0]; + "vleg %%v0,%3,1 \n\t" + "vflcdb %%v0,%%v0 \n\t" + "vleg %%v0,%3,0 \n\t" + "vlrepg %%v1,%4 \n\t" #endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,2 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%2) \n\t" - *dest += temp_r; - *(dest + 1) += temp_i; + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" - src += 2; - dest += inc_dest; - } - return; - } + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - FLOAT temp_r0; - FLOAT temp_i0; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT temp_r2; - FLOAT temp_i2; - FLOAT temp_r3; - FLOAT temp_i3; - for (i = 0; i < n; i += 4) { -#if !defined(XCONJ) - temp_r0 = alpha_r * src[0] - alpha_i * src[1]; - temp_i0 = alpha_r * src[1] + alpha_i * src[0]; - temp_r1 = alpha_r * src[2] - alpha_i * src[3]; - temp_i1 = alpha_r * src[3] + alpha_i * src[2]; - temp_r2 = alpha_r * src[4] - alpha_i * src[5]; - temp_i2 = alpha_r * src[5] + alpha_i * src[4]; - temp_r3 = alpha_r * src[6] - alpha_i * src[7]; - temp_i3 = alpha_r * src[7] + alpha_i * src[6]; -#else - temp_r0 = alpha_r * src[0] + alpha_i * src[1]; - temp_i0 = -alpha_r * src[1] + alpha_i * src[0]; - temp_r1 = alpha_r * src[2] + alpha_i * src[3]; - temp_i1 = -alpha_r * src[3] + alpha_i * src[2]; - temp_r2 = alpha_r * src[4] + alpha_i * src[5]; - temp_i2 = -alpha_r * src[5] + alpha_i * src[4]; - temp_r3 = alpha_r * src[6] + alpha_i * src[7]; - temp_i3 = -alpha_r * src[7] + alpha_i * src[6]; -#endif + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - dest[0] += temp_r0; - dest[1] += temp_i0; - dest[2] += temp_r1; - dest[3] += temp_i1; - dest[4] += temp_r2; - dest[5] += temp_i2; - dest[6] += temp_r3; - dest[7] += temp_i3; - - src += 8; - dest += 8; - } - return; + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "agfi %%r1,64 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; + if ( inc_dest != 2 ) + { - FLOAT xbuffer[8], *ybuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - ybuffer = buffer; - - inc_x *= 2; - inc_y *= 2; - lda *= 2; - - n1 = n / 4; - n2 = n % 4; - - m3 = m % 4; - m1 = m - (m % 4); - m2 = (m % NBMAX) - (m % 4); - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - a_ptr = a; - - x_ptr = x; - //zero_y(NB,ybuffer); - memset(ybuffer, 0, NB * 16); - - if (inc_x == 2) { - - for (i = 0; i < n1; i++) { - zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer); - - a_ptr += lda << 2; - x_ptr += 8; - } - - if (n2 & 2) { - zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer); - x_ptr += 4; - a_ptr += 2 * lda; - - } - - if (n2 & 1) { - zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); - x_ptr += 2; - a_ptr += lda; - - } - } else { - - for (i = 0; i < n1; i++) { - - xbuffer[0] = x_ptr[0]; - xbuffer[1] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - xbuffer[3] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[4] = x_ptr[0]; - xbuffer[5] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[6] = x_ptr[0]; - xbuffer[7] = x_ptr[1]; - x_ptr += inc_x; - - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer); - - a_ptr += lda << 2; - } - - for (i = 0; i < n2; i++) { - xbuffer[0] = x_ptr[0]; - xbuffer[1] = x_ptr[1]; - x_ptr += inc_x; - zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); - a_ptr += lda; - - } - - } - - add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); - a += 2 * NB; - y_ptr += NB * inc_y; - } - - if (m3 == 0) return (0); - - if (m3 == 1) { - a_ptr = a; - x_ptr = x; - FLOAT temp_r = 0.0; - FLOAT temp_i = 0.0; - - if (lda == 2 && inc_x == 2) { - - for (i = 0; i < (n & -2); i += 2) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; - temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; -#else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; - temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; -#endif - - a_ptr += 4; - x_ptr += 4; - } - - for (; i < n; i++) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; -#else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; -#endif - - a_ptr += 2; - x_ptr += 2; - } - - } else { - - for (i = 0; i < n; i++) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; -#else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; -#endif - - a_ptr += lda; - x_ptr += inc_x; - } - - } + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i -#endif - -#ifdef HAVE_KERNEL_4x4_VEC_ASM - -#elif HAVE_KERNEL_4x4_VEC - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - register __vector double vtemp1_p = {0.0, 0.0}; - register __vector double vtemp1_r = {0.0, 0.0}; - register __vector double vtemp2_p = {0.0, 0.0}; - register __vector double vtemp2_r = {0.0, 0.0}; - register __vector double vtemp3_p = {0.0, 0.0}; - register __vector double vtemp3_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { -// __builtin_prefetch(&x[i]); -// __builtin_prefetch(&a0[i]); -// __builtin_prefetch(&a1[i]); -// __builtin_prefetch(&a2[i]); -// __builtin_prefetch(&a3[i]); - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double va1 = *(__vector double*) (&a1[i]); - register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); - register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); - register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); - - register __vector double va2 = *(__vector double*) (&a2[i]); - register __vector double va2_1 = *(__vector double*) (&a2[i + 2]); - register __vector double va2_2 = *(__vector double*) (&a2[i + 4]); - register __vector double va2_3 = *(__vector double*) (&a2[i + 6]); - - register __vector double va3 = *(__vector double*) (&a3[i]); - register __vector double va3_1 = *(__vector double*) (&a3[i + 2]); - register __vector double va3_2 = *(__vector double*) (&a3[i + 4]); - register __vector double va3_3 = *(__vector double*) (&a3[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vtemp1_p += vx_0*va1; - vtemp1_r += vxr_0*va1; - - vtemp2_p += vx_0*va2; - vtemp2_r += vxr_0*va2; - - vtemp3_p += vx_0*va3; - vtemp3_r += vxr_0*va3; - - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vtemp1_p += vx_1*va1_1; - vtemp1_r += vxr_1*va1_1; - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp2_p += vx_1*va2_1; - vtemp2_r += vxr_1*va2_1; - - vtemp3_p += vx_1*va3_1; - vtemp3_r += vxr_1*va3_1; - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp1_p += vx_2*va1_2; - vtemp1_r += vxr_0*va1_2; - - vtemp2_p += vx_2*va2_2; - vtemp2_r += vxr_0*va2_2; - - vtemp3_p += vx_2*va3_2; - vtemp3_r += vxr_0*va3_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - vtemp1_p += vx_3*va1_3; - vtemp1_r += vxr_1*va1_3; - - vtemp2_p += vx_3*va2_3; - vtemp2_r += vxr_1*va2_3; - - vtemp3_p += vx_3*va3_3; - vtemp3_r += vxr_1*va3_3; - - } +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "vzero %%v17 \n\t" + "vzero %%v18 \n\t" + "vzero %%v19 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + "vl %%v20,0(%%r1,%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; - - register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1]; - + "vleg %%v21,8(%%r1,%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,0(%%r1,%5),1 \n\t" #else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - + "vleg %%v21,0(%%r1,%5),1 \n\t" + "vflcdb %%v21,%%v21 \n\t" + "vleg %%v21,8(%%r1,%5),0 \n\t" #endif -} -#else + "vlrepg %%v24,0(%%r1,%1) \n\t" + "vlrepg %%v25,8(%%r1,%1) \n\t" + "vlrepg %%v26,0(%%r1,%2) \n\t" + "vlrepg %%v27,8(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v24,%%v20,%%v16 \n\t" + "vfmadb %%v16,%%v25,%%v21,%%v16 \n\t" + "vfmadb %%v17,%%v26,%%v20,%%v17 \n\t" + "vfmadb %%v17,%%v27,%%v21,%%v17 \n\t" -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; + "vlrepg %%v28,0(%%r1,%3) \n\t" + "vlrepg %%v29,8(%%r1,%3) \n\t" + "vlrepg %%v30,0(%%r1,%4) \n\t" + "vlrepg %%v31,8(%%r1,%4) \n\t" + + "vfmadb %%v18,%%v28,%%v20,%%v18 \n\t" + "vfmadb %%v18,%%v29,%%v21,%%v18 \n\t" + "vfmadb %%v19,%%v30,%%v20,%%v19 \n\t" + "vfmadb %%v19,%%v31,%%v21,%%v19 \n\t" - FLOAT temp_r0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_r2 = 0.0; - FLOAT temp_r3 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_i1 = 0.0; - FLOAT temp_i2 = 0.0; - FLOAT temp_i3 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vl %%v22,16(%%r1,%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; - temp_r2 += a2[i] * x[i] - a2[i + 1] * x[i + 1]; - temp_i2 += a2[i] * x[i + 1] + a2[i + 1] * x[i]; - temp_r3 += a3[i] * x[i] - a3[i + 1] * x[i + 1]; - temp_i3 += a3[i] * x[i + 1] + a3[i + 1] * x[i]; + "vleg %%v23,24(%%r1,%5),0 \n\t" + "wflcdb %%v23,%%v23 \n\t" + "vleg %%v23,16(%%r1,%5),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; - temp_r2 += a2[i] * x[i] + a2[i + 1] * x[i + 1]; - temp_i2 += a2[i] * x[i + 1] - a2[i + 1] * x[i]; - temp_r3 += a3[i] * x[i] + a3[i + 1] * x[i + 1]; - temp_i3 += a3[i] * x[i + 1] - a3[i + 1] * x[i]; + "vleg %%v23,16(%%r1,%5),1 \n\t" + "vflcdb %%v23,%%v23 \n\t" + "vleg %%v23,24(%%r1,%5),0 \n\t" #endif - } + "vlrepg %%v24,16(%%r1,%1) \n\t" + "vlrepg %%v25,24(%%r1,%1) \n\t" + "vlrepg %%v26,16(%%r1,%2) \n\t" + "vlrepg %%v27,24(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v24,%%v22,%%v16 \n\t" + "vfmadb %%v16,%%v25,%%v23,%%v16 \n\t" + "vfmadb %%v17,%%v26,%%v22,%%v17 \n\t" + "vfmadb %%v17,%%v27,%%v23,%%v17 \n\t" + + "vlrepg %%v28,16(%%r1,%3) \n\t" + "vlrepg %%v29,24(%%r1,%3) \n\t" + "vlrepg %%v30,16(%%r1,%4) \n\t" + "vlrepg %%v31,24(%%r1,%4) \n\t" + + "vfmadb %%v18,%%v28,%%v22,%%v18 \n\t" + "vfmadb %%v18,%%v29,%%v23,%%v18 \n\t" + "vfmadb %%v19,%%v30,%%v22,%%v19 \n\t" + "vfmadb %%v19,%%v31,%%v23,%%v19 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - + "vlrepg %%v24,0(%7) \n\t" + "vleg %%v25,8(%7),0 \n\t" + "wflcdb %%v25,%%v25 \n\t" + "vleg %%v25,8(%7),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - + "vleg %%v24,0(%7),1 \n\t" + "vflcdb %%v24,%%v24 \n\t" + "vleg %%v24,0(%7),0 \n\t" + "vlrepg %%v25,8(%7) \n\t" #endif + "vl %%v26,0(%6) \n\t" + "vl %%v27,16(%6) \n\t" + "vl %%v28,32(%6) \n\t" + "vl %%v29,48(%6) \n\t" + "vfmadb %%v26,%%v16,%%v24,%%v26 \n\t" + "vfmadb %%v26,%%v20,%%v25,%%v26 \n\t" + "vfmadb %%v27,%%v17,%%v24,%%v27 \n\t" + "vfmadb %%v27,%%v21,%%v25,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v24,%%v28 \n\t" + "vfmadb %%v28,%%v22,%%v25,%%v28 \n\t" + "vfmadb %%v29,%%v19,%%v24,%%v29 \n\t" + "vfmadb %%v29,%%v23,%%v25,%%v29 \n\t" + "vst %%v26,0(%6) \n\t" + "vst %%v27,16(%6) \n\t" + "vst %%v28,32(%6) \n\t" + "vst %%v29,48(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - -#ifdef HAVE_KERNEL_4x2_VEC - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - register __vector double vtemp1_p = {0.0, 0.0}; - register __vector double vtemp1_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { - - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double va1 = *(__vector double*) (&a1[i]); - register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); - register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); - register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vtemp1_p += vx_0*va1; - vtemp1_r += vxr_0*va1; - - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vtemp1_p += vx_1*va1_1; - vtemp1_r += vxr_1*va1_1; - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - - vtemp1_p += vx_2*va1_2; - vtemp1_r += vxr_0*va1_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - vtemp1_p += vx_3*va1_3; - vtemp1_r += vxr_1*va1_3; - - } +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "vzero %%v17 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; - + "vleg %%v19,8(%%r1,%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,0(%%r1,%3),1 \n\t" #else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - + "vleg %%v19,0(%%r1,%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,8(%%r1,%3),0 \n\t" #endif -} -#else + "vlrepg %%v20,0(%%r1,%1) \n\t" + "vlrepg %%v21,8(%%r1,%1) \n\t" + "vlrepg %%v22,0(%%r1,%2) \n\t" + "vlrepg %%v23,8(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" + "vfmadb %%v16,%%v21,%%v19,%%v16 \n\t" + "vfmadb %%v17,%%v22,%%v18,%%v17 \n\t" + "vfmadb %%v17,%%v23,%%v19,%%v17 \n\t" -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - - FLOAT temp_r0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_i1 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vl %%v18,16(%%r1,%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; + "vleg %%v19,24(%%r1,%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,16(%%r1,%3),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; + "vleg %%v19,16(%%r1,%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,24(%%r1,%3),0 \n\t" #endif - } + "vlrepg %%v20,16(%%r1,%1) \n\t" + "vlrepg %%v21,24(%%r1,%1) \n\t" + "vlrepg %%v22,16(%%r1,%2) \n\t" + "vlrepg %%v23,24(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" + "vfmadb %%v16,%%v21,%%v19,%%v16 \n\t" + "vfmadb %%v17,%%v22,%%v18,%%v17 \n\t" + "vfmadb %%v17,%%v23,%%v19,%%v17 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + + "vpdi %%v18,%%v16,%%v16,4 \n\t" + "vpdi %%v19,%%v17,%%v17,4 \n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - + "vlrepg %%v20,0(%5) \n\t" + "vleg %%v21,8(%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,8(%5),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - + "vleg %%v20,0(%5),1 \n\t" + "vflcdb %%v20,%%v20 \n\t" + "vleg %%v20,0(%5),0 \n\t" + "vlrepg %%v21,8(%5) \n\t" #endif + "vl %%v22,0(%4) \n\t" + "vl %%v23,16(%4) \n\t" + "vfmadb %%v22,%%v16,%%v20,%%v22 \n\t" + "vfmadb %%v22,%%v18,%%v21,%%v22 \n\t" + "vfmadb %%v23,%%v17,%%v20,%%v23 \n\t" + "vfmadb %%v23,%%v19,%%v21,%%v23 \n\t" + "vst %%v22,0(%4) \n\t" + "vst %%v23,16(%4) \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -#endif - -#ifdef HAVE_KERNEL_4x1_VEC - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0 ; - a0 = ap; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { - - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - } +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - + "vleg %%v18,8(%%r1,%2),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%%r1,%2),1 \n\t" #else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + "vleg %%v18,0(%%r1,%2),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,8(%%r1,%2),0 \n\t" #endif -} + "vlrepg %%v19,0(%%r1,%1) \n\t" + "vlrepg %%v20,8(%%r1,%1) \n\t" + + "vfmadb %%v16,%%v19,%%v17,%%v16 \n\t" + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" -#else - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - - FLOAT temp_r0 = 0.0; - FLOAT temp_i0 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vl %%v17,16(%%r1,%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; + "vleg %%v18,24(%%r1,%2),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,16(%%r1,%2),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; + "vleg %%v18,16(%%r1,%2),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,24(%%r1,%2),0 \n\t" #endif - } + "vlrepg %%v19,16(%%r1,%1) \n\t" + "vlrepg %%v20,24(%%r1,%1) \n\t" + + "vfmadb %%v16,%%v19,%%v17,%%v16 \n\t" + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + + "vpdi %%v17,%%v16,%%v16,4 \n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - + "vlrepg %%v18,0(%4) \n\t" + "vleg %%v19,8(%4),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,8(%4),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - + "vleg %%v18,0(%4),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%4),0 \n\t" + "vlrepg %%v19,8(%4) \n\t" #endif - + "vl %%v20,0(%3) \n\t" + "vfmadb %%v20,%%v16,%%v18,%%v20 \n\t" + "vfmadb %%v20,%%v17,%%v19,%%v20 \n\t" + "vst %%v20,0(%3) \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20" + ); } -#endif - -static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8], *xbuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; - - xbuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); - y_ptr = y; - a_ptr = a; x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; + if ( m3 == 3 ) + { - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - zgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } - - if (n2 & 1) { - zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - - a_ptr += lda << 2; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } - if (m3 == 2) { - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; + if ( m3 == 2 ) + { - while (j < (n & -2)) { + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } - while (j < n) { + + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } - return (0); - } + return(0); + } - if (m3 == 1) { - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; + if ( m3 == 1 ) + { - while (j < (n & -2)) { + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } - while (j < n) { + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + return(0); } - diff --git a/ztest/gemv.c b/ztest/gemv.c index f1ee972bc..964afd3ef 100644 --- a/ztest/gemv.c +++ b/ztest/gemv.c @@ -52,67 +52,66 @@ int assert_dbl_near(double exp, double real, double tol) { int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; - BLASLONG ix,iy; + BLASLONG ix, iy; BLASLONG j; FLOAT *a_ptr; - FLOAT temp_r,temp_i; - BLASLONG inc_x2,inc_y2; + FLOAT temp_r, temp_i; + BLASLONG inc_x2, inc_y2; BLASLONG lda2; BLASLONG i2; - lda2 = 2*lda; + lda2 = 2 * lda; ix = 0; a_ptr = a; - if ( inc_x == 1 && inc_y == 1 ) + if (inc_x == 1 && inc_y == 1) { - for (j=0; j Date: Fri, 4 Jan 2019 01:38:18 +0200 Subject: [PATCH 04/26] [ZARCH] fix sgemv_t_4.c --- kernel/zarch/sgemv_t_4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index efc06297f..fe99ef5ce 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -158,8 +158,6 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" "vrepf %%v4,%%v0,1 \n\t" "aebr %%f0,%%f4 \n\t" "vrepf %%v4,%%v0,2 \n\t" @@ -351,6 +349,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) "vl %%v31,112(%%r1,%1) \n\t" "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" "1: \n\t" "lghi %%r0,28 \n\t" From 94cd946b963e9e077cb4a4c5d93b1ce691e1fe63 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 4 Jan 2019 17:45:56 +0200 Subject: [PATCH 05/26] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 308 +++++++++++++++++++-------------------- 1 file changed, 154 insertions(+), 154 deletions(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index 4c3253774..c939aea9f 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -34,107 +34,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%5) \n\t" - "vlrepg %%v17,8(%5) \n\t" - "vlrepg %%v18,16(%5) \n\t" - "vlrepg %%v19,24(%5) \n\t" + "vlrepg %%v16,0(%5) \n\t" + "vlrepg %%v17,8(%5) \n\t" + "vlrepg %%v18,16(%5) \n\t" + "vlrepg %%v19,24(%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" #else - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" - "vlef %%v24,0(%%r1,%1),0 \n\t" - "vlef %%v24,0(%%r1,%1),1 \n\t" - "vlef %%v24,8(%%r1,%1),2 \n\t" - "vlef %%v24,8(%%r1,%1),3 \n\t" - "vlef %%v25,4(%%r1,%1),0 \n\t" - "vlef %%v25,4(%%r1,%1),1 \n\t" - "vlef %%v25,12(%%r1,%1),2 \n\t" - "vlef %%v25,12(%%r1,%1),3 \n\t" - "vlef %%v26,0(%%r1,%2),0 \n\t" - "vlef %%v26,0(%%r1,%2),1 \n\t" - "vlef %%v26,8(%%r1,%2),2 \n\t" - "vlef %%v26,8(%%r1,%2),3 \n\t" - "vlef %%v27,4(%%r1,%2),0 \n\t" - "vlef %%v27,4(%%r1,%2),1 \n\t" - "vlef %%v27,12(%%r1,%2),2 \n\t" - "vlef %%v27,12(%%r1,%2),3 \n\t" + "vlef %%v24,0(%%r1,%1),0 \n\t" + "vlef %%v24,0(%%r1,%1),1 \n\t" + "vlef %%v24,8(%%r1,%1),2 \n\t" + "vlef %%v24,8(%%r1,%1),3 \n\t" + "vlef %%v25,4(%%r1,%1),0 \n\t" + "vlef %%v25,4(%%r1,%1),1 \n\t" + "vlef %%v25,12(%%r1,%1),2 \n\t" + "vlef %%v25,12(%%r1,%1),3 \n\t" + "vlef %%v26,0(%%r1,%2),0 \n\t" + "vlef %%v26,0(%%r1,%2),1 \n\t" + "vlef %%v26,8(%%r1,%2),2 \n\t" + "vlef %%v26,8(%%r1,%2),3 \n\t" + "vlef %%v27,4(%%r1,%2),0 \n\t" + "vlef %%v27,4(%%r1,%2),1 \n\t" + "vlef %%v27,12(%%r1,%2),2 \n\t" + "vlef %%v27,12(%%r1,%2),3 \n\t" - "vl %%v0,0(%%r1,%6) \n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" + "vl %%v0,0(%%r1,%6) \n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - "vlef %%v28,0(%%r1,%1),0 \n\t" - "vlef %%v28,0(%%r1,%1),1 \n\t" - "vlef %%v28,8(%%r1,%1),2 \n\t" - "vlef %%v28,8(%%r1,%1),3 \n\t" - "vlef %%v29,4(%%r1,%1),0 \n\t" - "vlef %%v29,4(%%r1,%1),1 \n\t" - "vlef %%v29,12(%%r1,%1),2 \n\t" - "vlef %%v29,12(%%r1,%1),3 \n\t" - "vlef %%v30,0(%%r1,%2),0 \n\t" - "vlef %%v30,0(%%r1,%2),1 \n\t" - "vlef %%v30,8(%%r1,%2),2 \n\t" - "vlef %%v30,8(%%r1,%2),3 \n\t" - "vlef %%v31,4(%%r1,%2),0 \n\t" - "vlef %%v31,4(%%r1,%2),1 \n\t" - "vlef %%v31,12(%%r1,%2),2 \n\t" - "vlef %%v31,12(%%r1,%2),3 \n\t" + "vlef %%v28,0(%%r1,%1),0 \n\t" + "vlef %%v28,0(%%r1,%1),1 \n\t" + "vlef %%v28,8(%%r1,%1),2 \n\t" + "vlef %%v28,8(%%r1,%1),3 \n\t" + "vlef %%v29,4(%%r1,%1),0 \n\t" + "vlef %%v29,4(%%r1,%1),1 \n\t" + "vlef %%v29,12(%%r1,%1),2 \n\t" + "vlef %%v29,12(%%r1,%1),3 \n\t" + "vlef %%v30,0(%%r1,%2),0 \n\t" + "vlef %%v30,0(%%r1,%2),1 \n\t" + "vlef %%v30,8(%%r1,%2),2 \n\t" + "vlef %%v30,8(%%r1,%2),3 \n\t" + "vlef %%v31,4(%%r1,%2),0 \n\t" + "vlef %%v31,4(%%r1,%2),1 \n\t" + "vlef %%v31,12(%%r1,%2),2 \n\t" + "vlef %%v31,12(%%r1,%2),3 \n\t" "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" @@ -153,56 +153,56 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%3) \n\t" - "vlrepg %%v17,8(%3) \n\t" + "vlrepg %%v16,0(%3) \n\t" + "vlrepg %%v17,8(%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" #else - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" - "vlef %%v20,0(%%r1,%1),0 \n\t" - "vlef %%v20,0(%%r1,%1),1 \n\t" - "vlef %%v20,8(%%r1,%1),2 \n\t" - "vlef %%v20,8(%%r1,%1),3 \n\t" - "vlef %%v21,4(%%r1,%1),0 \n\t" - "vlef %%v21,4(%%r1,%1),1 \n\t" - "vlef %%v21,12(%%r1,%1),2 \n\t" - "vlef %%v21,12(%%r1,%1),3 \n\t" - "vlef %%v22,0(%%r1,%2),0 \n\t" - "vlef %%v22,0(%%r1,%2),1 \n\t" - "vlef %%v22,8(%%r1,%2),2 \n\t" - "vlef %%v22,8(%%r1,%2),3 \n\t" - "vlef %%v23,4(%%r1,%2),0 \n\t" - "vlef %%v23,4(%%r1,%2),1 \n\t" - "vlef %%v23,12(%%r1,%2),2 \n\t" - "vlef %%v23,12(%%r1,%2),3 \n\t" + "vlef %%v20,0(%%r1,%1),0 \n\t" + "vlef %%v20,0(%%r1,%1),1 \n\t" + "vlef %%v20,8(%%r1,%1),2 \n\t" + "vlef %%v20,8(%%r1,%1),3 \n\t" + "vlef %%v21,4(%%r1,%1),0 \n\t" + "vlef %%v21,4(%%r1,%1),1 \n\t" + "vlef %%v21,12(%%r1,%1),2 \n\t" + "vlef %%v21,12(%%r1,%1),3 \n\t" + "vlef %%v22,0(%%r1,%2),0 \n\t" + "vlef %%v22,0(%%r1,%2),1 \n\t" + "vlef %%v22,8(%%r1,%2),2 \n\t" + "vlef %%v22,8(%%r1,%2),3 \n\t" + "vlef %%v23,4(%%r1,%2),0 \n\t" + "vlef %%v23,4(%%r1,%2),1 \n\t" + "vlef %%v23,12(%%r1,%2),2 \n\t" + "vlef %%v23,12(%%r1,%2),3 \n\t" "vl %%v0,0(%%r1,%4) \n\t" "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" @@ -222,34 +222,34 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%2) \n\t" + "vlrepg %%v16,0(%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,4(%2),2 \n\t" "vflcsb %%v17,%%v17 \n\t" "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,0(%2),3 \n\t" #else "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,0(%2),3 \n\t" "vflcsb %%v17,%%v17 \n\t" "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,4(%2),2 \n\t" #endif "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" + "srlg %%r0,%0,1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%1) \n\t" "pfd 2,1024(%%r1,%3) \n\t" - "vlef %%v18,0(%%r1,%1),0 \n\t" - "vlef %%v18,0(%%r1,%1),1 \n\t" - "vlef %%v18,8(%%r1,%1),2 \n\t" - "vlef %%v18,8(%%r1,%1),3 \n\t" - "vlef %%v19,4(%%r1,%1),0 \n\t" - "vlef %%v19,4(%%r1,%1),1 \n\t" - "vlef %%v19,12(%%r1,%1),2 \n\t" - "vlef %%v19,12(%%r1,%1),3 \n\t" + "vlef %%v18,0(%%r1,%1),0 \n\t" + "vlef %%v18,0(%%r1,%1),1 \n\t" + "vlef %%v18,8(%%r1,%1),2 \n\t" + "vlef %%v18,8(%%r1,%1),3 \n\t" + "vlef %%v19,4(%%r1,%1),0 \n\t" + "vlef %%v19,4(%%r1,%1),1 \n\t" + "vlef %%v19,12(%%r1,%1),2 \n\t" + "vlef %%v19,12(%%r1,%1),3 \n\t" "vl %%v0,0(%%r1,%3) \n\t" "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t" @@ -268,18 +268,18 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al { __asm__ volatile ( #if !defined(XCONJ) - "vlrepf %%v0,%3 \n\t" - "vlef %%v1,%4,0 \n\t" - "vlef %%v1,%4,2 \n\t" + "vlrepf %%v0,%3 \n\t" + "vlef %%v1,%4,0 \n\t" + "vlef %%v1,%4,2 \n\t" "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,%4,1 \n\t" + "vlef %%v1,%4,1 \n\t" "vlef %%v1,%4,3 \n\t" #else "vlef %%v0,%3,1 \n\t" - "vlef %%v0,%3,3 \n\t" + "vlef %%v0,%3,3 \n\t" "vflcsb %%v0,%%v0 \n\t" "vlef %%v0,%3,0 \n\t" - "vlef %%v0,%3,2 \n\t" + "vlef %%v0,%3,2 \n\t" "vlrepf %%v1,%4 \n\t" #endif "xgr %%r1,%%r1 \n\t" @@ -292,7 +292,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al "vl %%v17,16(%%r1,%1) \n\t" "vl %%v18,0(%%r1,%2) \n\t" "vl %%v19,16(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" + "verllg %%v20,%%v16,32 \n\t" "verllg %%v21,%%v17,32 \n\t" "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" From 3eafcfa6507891f7fff781423d9eb6af13501133 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 07:43:45 +0200 Subject: [PATCH 06/26] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index c939aea9f..7b5e43497 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -119,22 +119,22 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - "vlef %%v28,0(%%r1,%1),0 \n\t" - "vlef %%v28,0(%%r1,%1),1 \n\t" - "vlef %%v28,8(%%r1,%1),2 \n\t" - "vlef %%v28,8(%%r1,%1),3 \n\t" - "vlef %%v29,4(%%r1,%1),0 \n\t" - "vlef %%v29,4(%%r1,%1),1 \n\t" - "vlef %%v29,12(%%r1,%1),2 \n\t" - "vlef %%v29,12(%%r1,%1),3 \n\t" - "vlef %%v30,0(%%r1,%2),0 \n\t" - "vlef %%v30,0(%%r1,%2),1 \n\t" - "vlef %%v30,8(%%r1,%2),2 \n\t" - "vlef %%v30,8(%%r1,%2),3 \n\t" - "vlef %%v31,4(%%r1,%2),0 \n\t" - "vlef %%v31,4(%%r1,%2),1 \n\t" - "vlef %%v31,12(%%r1,%2),2 \n\t" - "vlef %%v31,12(%%r1,%2),3 \n\t" + "vlef %%v28,0(%%r1,%3),0 \n\t" + "vlef %%v28,0(%%r1,%3),1 \n\t" + "vlef %%v28,8(%%r1,%3),2 \n\t" + "vlef %%v28,8(%%r1,%3),3 \n\t" + "vlef %%v29,4(%%r1,%3),0 \n\t" + "vlef %%v29,4(%%r1,%3),1 \n\t" + "vlef %%v29,12(%%r1,%3),2 \n\t" + "vlef %%v29,12(%%r1,%3),3 \n\t" + "vlef %%v30,0(%%r1,%4),0 \n\t" + "vlef %%v30,0(%%r1,%4),1 \n\t" + "vlef %%v30,8(%%r1,%4),2 \n\t" + "vlef %%v30,8(%%r1,%4),3 \n\t" + "vlef %%v31,4(%%r1,%4),0 \n\t" + "vlef %%v31,4(%%r1,%4),1 \n\t" + "vlef %%v31,12(%%r1,%4),2 \n\t" + "vlef %%v31,12(%%r1,%4),3 \n\t" "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" From e7455f500c06ecda4085d560ffa20c5bc188416f Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:33:54 +0200 Subject: [PATCH 07/26] [ZARCH] fix dsdot.c --- kernel/zarch/dsdot.c | 119 ++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 65 deletions(-) diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 17461a029..800bb0d51 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -27,61 +27,34 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { double dot; __asm__ volatile ( "vzero %%v0 \n\t" - "srlg %%r0,%1,5 \n\t" + "srlg %%r0,%1,4 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmsb %%v16,%%v16,%%v24 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmsb %%v17,%%v17,%%v25 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmsb %%v18,%%v18,%%v26 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmsb %%v19,%%v19,%%v27 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmsb %%v20,%%v20,%%v28 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmsb %%v21,%%v21,%%v29 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmsb %%v22,%%v22,%%v30 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmsb %%v23,%%v23,%%v31 \n\t" - - "vflls %%v24,%%v16 \n\t" - "vflls %%v25,%%v17 \n\t" - "vflls %%v26,%%v18 \n\t" - "vflls %%v27,%%v19 \n\t" - "vflls %%v28,%%v20 \n\t" - "vflls %%v29,%%v21 \n\t" - "vflls %%v30,%%v22 \n\t" - "vflls %%v31,%%v23 \n\t" - - "veslg %%v16,%%v16,32 \n\t" - "veslg %%v17,%%v17,32 \n\t" - "veslg %%v18,%%v18,32 \n\t" - "veslg %%v19,%%v19,32 \n\t" - "veslg %%v20,%%v20,32 \n\t" - "veslg %%v21,%%v21,32 \n\t" - "veslg %%v22,%%v22,32 \n\t" - "veslg %%v23,%%v23,32 \n\t" + "vlef %%v16,0(%%r1,%2),0 \n\t" + "vlef %%v16,4(%%r1,%2),2 \n\t" + "vlef %%v17,8(%%r1,%2),0 \n\t" + "vlef %%v17,12(%%r1,%2),2 \n\t" + "vlef %%v18,16(%%r1,%2),0 \n\t" + "vlef %%v18,20(%%r1,%2),2 \n\t" + "vlef %%v19,24(%%r1,%2),0 \n\t" + "vlef %%v19,28(%%r1,%2),2 \n\t" + "vlef %%v20,32(%%r1,%2),0 \n\t" + "vlef %%v20,36(%%r1,%2),2 \n\t" + "vlef %%v21,40(%%r1,%2),0 \n\t" + "vlef %%v21,44(%%r1,%2),2 \n\t" + "vlef %%v22,48(%%r1,%2),0 \n\t" + "vlef %%v22,52(%%r1,%2),2 \n\t" + "vlef %%v23,56(%%r1,%2),0 \n\t" + "vlef %%v23,60(%%r1,%2),2 \n\t" "vflls %%v16,%%v16 \n\t" "vflls %%v17,%%v17 \n\t" @@ -92,24 +65,40 @@ static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) "vflls %%v22,%%v22 \n\t" "vflls %%v23,%%v23 \n\t" - "vfadb %%v16,%%v16,%%v24 \n\t" - "vfadb %%v17,%%v17,%%v25 \n\t" - "vfadb %%v18,%%v18,%%v26 \n\t" - "vfadb %%v19,%%v19,%%v27 \n\t" - "vfadb %%v20,%%v20,%%v28 \n\t" - "vfadb %%v21,%%v21,%%v29 \n\t" - "vfadb %%v22,%%v22,%%v30 \n\t" - "vfadb %%v23,%%v23,%%v31 \n\t" - "vfadb %%v16,%%v16,%%v20 \n\t" - "vfadb %%v17,%%v17,%%v21 \n\t" - "vfadb %%v18,%%v18,%%v22 \n\t" - "vfadb %%v19,%%v19,%%v23 \n\t" - "vfadb %%v16,%%v16,%%v18 \n\t" - "vfadb %%v17,%%v17,%%v19 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v0,%%v16,%%v0 \n\t" + "vlef %%v24,0(%%r1,%3),0 \n\t" + "vlef %%v24,4(%%r1,%3),2 \n\t" + "vflls %%v24,%%v24 \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vlef %%v25,8(%%r1,%3),0 \n\t" + "vlef %%v25,12(%%r1,%3),2 \n\t" + "vflls %%v25,%%v25 \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + "vlef %%v26,16(%%r1,%3),0 \n\t" + "vlef %%v26,20(%%r1,%3),2 \n\t" + "vflls %%v26,%%v26 \n\t" + "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" + "vlef %%v27,24(%%r1,%3),0 \n\t" + "vlef %%v27,28(%%r1,%3),2 \n\t" + "vflls %%v27,%%v27 \n\t" + "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" + "vlef %%v28,32(%%r1,%3),0 \n\t" + "vlef %%v28,36(%%r1,%3),2 \n\t" + "vflls %%v28,%%v28 \n\t" + "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" + "vlef %%v29,40(%%r1,%3),0 \n\t" + "vlef %%v29,44(%%r1,%3),2 \n\t" + "vflls %%v29,%%v29 \n\t" + "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" + "vlef %%v30,48(%%r1,%3),0 \n\t" + "vlef %%v30,52(%%r1,%3),2 \n\t" + "vflls %%v30,%%v30 \n\t" + "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" + "vlef %%v31,56(%%r1,%3),0 \n\t" + "vlef %%v31,60(%%r1,%3),2 \n\t" + "vflls %%v31,%%v31 \n\t" + "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - "agfi %%r1,128 \n\t" + "agfi %%r1,64 \n\t" "brctg %%r0,0b \n\t" "vrepg %%v1,%%v0,1 \n\t" "adbr %%f0,%%f1 \n\t" @@ -134,10 +123,10 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 ) - dot = dsdot_kernel_32(n1,x,y); + dot = dsdot_kernel_16(n1,x,y); i = n1; while(i < n) From c2ffef81569624cc530d515bbaac9890d819253b Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:49:44 +0200 Subject: [PATCH 08/26] [ZARCH] fix data prefetch type in ddot --- kernel/zarch/ddot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index f34d1e96e..ff4c347a6 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -37,7 +37,7 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%2) \n\t" From be66f5d5c21b558dd1ef35dc8f4bda6b544b4f79 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:50:07 +0200 Subject: [PATCH 09/26] [ZARCH] fix data prefetch type in sdot --- kernel/zarch/sdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index fd8c8e445..5ddbc69bd 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -37,7 +37,7 @@ static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%2) \n\t" From 67432b23c2fe7f8ef29cf85821278dcdf69b4db2 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 16:44:46 +0200 Subject: [PATCH 10/26] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index 7b5e43497..a45c3d687 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -396,7 +396,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, ap[3] = ap[2] + lda; x_ptr = x; //zero_y(NB,ybuffer); - memset(ybuffer,0,NB*16); + memset(ybuffer,0,NB*8); if ( inc_x == 2 ) { From 5d89d6b143ea770e4dcb2336319b543f2297c6ba Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:08:24 +0200 Subject: [PATCH 11/26] [ZARCH] fix sgemv_n_4.c --- kernel/zarch/sgemv_n_4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index 92019d732..01d8414de 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -435,7 +435,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ap[3] = ap[2] + lda; if ( inc_y != 1 ) - memset(ybuffer,0,NB*8); + memset(ybuffer,0,NB*4); else ybuffer = y_ptr; @@ -465,8 +465,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 1 ) { sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; + /* a_ptr += lda; + x_ptr += 1; */ } From ecc31b743fc93d3b5951e83e6e37148dbdd381c8 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:13:02 +0200 Subject: [PATCH 12/26] Update dgemv_t_4.c --- kernel/zarch/dgemv_t_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index f9c1f966d..2d8fa0d10 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -601,9 +601,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += lda; + // a_ptr += lda; *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; + // y_ptr += inc_y; } a += NB; From b731e8246f9fad13637005be39d8566111bab9fe Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:14:04 +0200 Subject: [PATCH 13/26] Update sgemv_t_4.c --- kernel/zarch/sgemv_t_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index fe99ef5ce..5515d7bb7 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -605,9 +605,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += lda; + // a_ptr += lda; *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; + // y_ptr += inc_y; } a += NB; From 621dedb37bd1d33c7006c305b4057bb0cc7ea7cd Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:37:11 +0200 Subject: [PATCH 14/26] [ZARCH] Update cgemv_t_4.c --- kernel/zarch/cgemv_t_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index 89914fb1f..0dd43057c 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define NBMAX 1024 +#define NBMAX 2048 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { From 406f835f00fedcfef894742b30a7f48905836eee Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:39:17 +0200 Subject: [PATCH 15/26] [ZARCH] update cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index a45c3d687..ed81325e1 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#define NBMAX 1024 +#define NBMAX 2048 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { From 1a7925b3a335114d26bd1d25d6f6fdc2743909b6 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:43:11 +0200 Subject: [PATCH 16/26] [ZARCH] Update dgemv_n_4.c --- kernel/zarch/dgemv_n_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index ca6d287bc..ca4fd6170 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -488,8 +488,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 1 ) { dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; + /* a_ptr += lda; + x_ptr += 1; */ } From b815a04c87e49a01e66e1c41ce4654f8d7817f83 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 15 Jan 2019 21:04:22 +0200 Subject: [PATCH 17/26] [ZARCH] fix a bug in max/min functions --- kernel/zarch/camax.c | 2 +- kernel/zarch/camin.c | 2 +- kernel/zarch/damax.c | 2 +- kernel/zarch/damin.c | 2 +- kernel/zarch/dmax.c | 2 +- kernel/zarch/dmin.c | 2 +- kernel/zarch/idamax.c | 2 +- kernel/zarch/idamin.c | 2 +- kernel/zarch/idmax.c | 2 +- kernel/zarch/idmin.c | 2 +- kernel/zarch/isamax.c | 2 +- kernel/zarch/isamin.c | 2 +- kernel/zarch/ismax.c | 2 +- kernel/zarch/ismin.c | 2 +- kernel/zarch/samax.c | 2 +- kernel/zarch/samin.c | 2 +- kernel/zarch/smax.c | 2 +- kernel/zarch/smin.c | 2 +- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 3506c4e9b..2c913b62e 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -237,7 +237,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 726747b99..733f98fbf 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -237,7 +237,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index b74af5d37..236d11c72 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -172,7 +172,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 4cf5e88b1..c2c63c6c5 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -172,7 +172,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index de38bd21a..469f65735 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -148,7 +148,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index d7c86735f..3df504950 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -148,7 +148,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index d1f135369..4f7ff6985 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -226,7 +226,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 679606a8f..3abc7a558 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -226,7 +226,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 5de41ac7b..313a88db4 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -202,7 +202,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 7fec111cf..42443215b 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -202,7 +202,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index d2686c0cd..dd2144db2 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -269,7 +269,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 768f31a8c..d7e44421d 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -269,7 +269,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 8fc32adf6..1ebc6c8c8 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -245,7 +245,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 415052810..a6b9d59de 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -245,7 +245,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index 1025cfcbf..61d50159f 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -176,7 +176,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index 3b8f03e6a..a585a79ff 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -176,7 +176,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index 33798eb7c..bcdb473af 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index e882b7ff1..91c31d284 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 937bc9753..8ef3f42ca 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -189,7 +189,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 8564edaf4..30fd1d030 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -189,7 +189,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); From b111829226874550c524b36882ff84c90008f494 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 21 Jan 2019 15:56:04 +0200 Subject: [PATCH 18/26] [ZARCH] Update max/min functions --- kernel/zarch/camax.c | 160 +++++++++++++++++-------------------- kernel/zarch/camin.c | 178 +++++++++++++++++++----------------------- kernel/zarch/damax.c | 100 +++++++----------------- kernel/zarch/damin.c | 102 ++++++++---------------- kernel/zarch/dmax.c | 81 +++++++------------ kernel/zarch/dmin.c | 81 +++++++------------ kernel/zarch/icamax.c | 33 ++++---- kernel/zarch/icamin.c | 31 ++++---- kernel/zarch/idamax.c | 51 ++++++------ kernel/zarch/idamin.c | 51 ++++++------ kernel/zarch/idmax.c | 51 ++++++------ kernel/zarch/idmin.c | 51 ++++++------ kernel/zarch/isamax.c | 55 +++++++------ kernel/zarch/isamin.c | 55 +++++++------ kernel/zarch/ismax.c | 55 +++++++------ kernel/zarch/ismin.c | 55 +++++++------ kernel/zarch/izamax.c | 27 ++++--- kernel/zarch/izamin.c | 27 ++++--- kernel/zarch/samax.c | 103 ++++++++---------------- kernel/zarch/samin.c | 103 ++++++++---------------- kernel/zarch/smax.c | 84 +++++++------------- kernel/zarch/smin.c | 84 +++++++------------- kernel/zarch/zamax.c | 116 ++++++++++++--------------- kernel/zarch/zamin.c | 116 ++++++++++++--------------- 24 files changed, 769 insertions(+), 1081 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 2c913b62e..66d250896 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -55,7 +55,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" - "pfd 1, 1024(%2) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" "vlef %%v16,0(%%r1,%2),0 \n\t" "vlef %%v17,4(%%r1,%2),0 \n\t" @@ -93,100 +93,88 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v22,120(%%r1,%2),3 \n\t" "vlef %%v23,124(%%r1,%2),3 \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vlef %%v24,128(%%r1,%2),0 \n\t" + "vlef %%v25,132(%%r1,%2),0 \n\t" + "vlef %%v24,136(%%r1,%2),1 \n\t" + "vlef %%v25,140(%%r1,%2),1 \n\t" + "vlef %%v24,144(%%r1,%2),2 \n\t" + "vlef %%v25,148(%%r1,%2),2 \n\t" + "vlef %%v24,152(%%r1,%2),3 \n\t" + "vlef %%v25,156(%%r1,%2),3 \n\t" + + "vlef %%v26,160(%%r1,%2),0 \n\t" + "vlef %%v27,164(%%r1,%2),0 \n\t" + "vlef %%v26,168(%%r1,%2),1 \n\t" + "vlef %%v27,172(%%r1,%2),1 \n\t" + "vlef %%v26,176(%%r1,%2),2 \n\t" + "vlef %%v27,180(%%r1,%2),2 \n\t" + "vlef %%v26,184(%%r1,%2),3 \n\t" + "vlef %%v27,188(%%r1,%2),3 \n\t" + + "vlef %%v28,192(%%r1,%2),0 \n\t" + "vlef %%v29,196(%%r1,%2),0 \n\t" + "vlef %%v28,200(%%r1,%2),1 \n\t" + "vlef %%v29,204(%%r1,%2),1 \n\t" + "vlef %%v28,208(%%r1,%2),2 \n\t" + "vlef %%v29,212(%%r1,%2),2 \n\t" + "vlef %%v28,216(%%r1,%2),3 \n\t" + "vlef %%v29,220(%%r1,%2),3 \n\t" + + "vlef %%v30,224(%%r1,%2),0 \n\t" + "vlef %%v31,228(%%r1,%2),0 \n\t" + "vlef %%v30,232(%%r1,%2),1 \n\t" + "vlef %%v31,236(%%r1,%2),1 \n\t" + "vlef %%v30,240(%%r1,%2),2 \n\t" + "vlef %%v31,244(%%r1,%2),2 \n\t" + "vlef %%v30,248(%%r1,%2),3 \n\t" + "vlef %%v31,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16,%%v16 \n\t" + "vflpsb %%v17,%%v17 \n\t" + "vflpsb %%v18,%%v18 \n\t" + "vflpsb %%v19,%%v19 \n\t" + "vflpsb %%v20,%%v20 \n\t" + "vflpsb %%v21,%%v21 \n\t" + "vflpsb %%v22,%%v22 \n\t" + "vflpsb %%v23,%%v23 \n\t" + "vflpsb %%v24,%%v24 \n\t" + "vflpsb %%v25,%%v25 \n\t" + "vflpsb %%v26,%%v26 \n\t" + "vflpsb %%v27,%%v27 \n\t" + "vflpsb %%v28,%%v28 \n\t" + "vflpsb %%v29,%%v29 \n\t" + "vflpsb %%v30,%%v30 \n\t" + "vflpsb %%v31,%%v31 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" + "vfasb %%v18,%%v18,%%v19 \n\t" + "vfasb %%v20,%%v20,%%v21 \n\t" + "vfasb %%v22,%%v22,%%v23 \n\t" + "vfasb %%v24,%%v24,%%v25 \n\t" + "vfasb %%v26,%%v26,%%v27 \n\t" + "vfasb %%v28,%%v28,%%v29 \n\t" + "vfasb %%v30,%%v30,%%v31 \n\t" - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" - "vfchsb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" - "vfchsb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" - "vlef %%v16,128(%%r1,%2),0 \n\t" - "vlef %%v17,132(%%r1,%2),0 \n\t" - "vlef %%v16,136(%%r1,%2),1 \n\t" - "vlef %%v17,140(%%r1,%2),1 \n\t" - "vlef %%v16,144(%%r1,%2),2 \n\t" - "vlef %%v17,148(%%r1,%2),2 \n\t" - "vlef %%v16,152(%%r1,%2),3 \n\t" - "vlef %%v17,156(%%r1,%2),3 \n\t" - - "vlef %%v18,160(%%r1,%2),0 \n\t" - "vlef %%v19,164(%%r1,%2),0 \n\t" - "vlef %%v18,168(%%r1,%2),1 \n\t" - "vlef %%v19,172(%%r1,%2),1 \n\t" - "vlef %%v18,176(%%r1,%2),2 \n\t" - "vlef %%v19,180(%%r1,%2),2 \n\t" - "vlef %%v18,184(%%r1,%2),3 \n\t" - "vlef %%v19,188(%%r1,%2),3 \n\t" - - "vlef %%v20,192(%%r1,%2),0 \n\t" - "vlef %%v21,196(%%r1,%2),0 \n\t" - "vlef %%v20,200(%%r1,%2),1 \n\t" - "vlef %%v21,204(%%r1,%2),1 \n\t" - "vlef %%v20,208(%%r1,%2),2 \n\t" - "vlef %%v21,212(%%r1,%2),2 \n\t" - "vlef %%v20,216(%%r1,%2),3 \n\t" - "vlef %%v21,220(%%r1,%2),3 \n\t" - - "vlef %%v22,224(%%r1,%2),0 \n\t" - "vlef %%v23,228(%%r1,%2),0 \n\t" - "vlef %%v22,232(%%r1,%2),1 \n\t" - "vlef %%v23,236(%%r1,%2),1 \n\t" - "vlef %%v22,240(%%r1,%2),2 \n\t" - "vlef %%v23,244(%%r1,%2),2 \n\t" - "vlef %%v22,248(%%r1,%2),3 \n\t" - "vlef %%v23,252(%%r1,%2),3 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchsb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchsb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -233,11 +221,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { maxf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 733f98fbf..5abc685b2 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -43,8 +43,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) __asm__ volatile ( "vlef %%v0,0(%2),0 \n\t" "vlef %%v16,4(%2),0 \n\t" - "vlef %%v0,8(%2),0 \n\t" - "vlef %%v16,12(%2),0 \n\t" + "vlef %%v0,8(%2),1 \n\t" + "vlef %%v16,12(%2),1 \n\t" "vlef %%v0,16(%2),2 \n\t" "vlef %%v16,20(%2),2 \n\t" "vlef %%v0,24(%2),3 \n\t" @@ -59,8 +59,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v16,0(%%r1,%2),0 \n\t" "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),0 \n\t" - "vlef %%v17,12(%%r1,%2),0 \n\t" + "vlef %%v16,8(%%r1,%2),1 \n\t" + "vlef %%v17,12(%%r1,%2),1 \n\t" "vlef %%v16,16(%%r1,%2),2 \n\t" "vlef %%v17,20(%%r1,%2),2 \n\t" "vlef %%v16,24(%%r1,%2),3 \n\t" @@ -68,8 +68,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v18,32(%%r1,%2),0 \n\t" "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),0 \n\t" - "vlef %%v19,44(%%r1,%2),0 \n\t" + "vlef %%v18,40(%%r1,%2),1 \n\t" + "vlef %%v19,44(%%r1,%2),1 \n\t" "vlef %%v18,48(%%r1,%2),2 \n\t" "vlef %%v19,52(%%r1,%2),2 \n\t" "vlef %%v18,56(%%r1,%2),3 \n\t" @@ -77,8 +77,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v20,64(%%r1,%2),0 \n\t" "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),0 \n\t" - "vlef %%v21,76(%%r1,%2),0 \n\t" + "vlef %%v20,72(%%r1,%2),1 \n\t" + "vlef %%v21,76(%%r1,%2),1 \n\t" "vlef %%v20,80(%%r1,%2),2 \n\t" "vlef %%v21,84(%%r1,%2),2 \n\t" "vlef %%v20,88(%%r1,%2),3 \n\t" @@ -86,107 +86,95 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v22,96(%%r1,%2),0 \n\t" "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),0 \n\t" - "vlef %%v23,108(%%r1,%2),0 \n\t" + "vlef %%v22,104(%%r1,%2),1 \n\t" + "vlef %%v23,108(%%r1,%2),1 \n\t" "vlef %%v22,112(%%r1,%2),2 \n\t" "vlef %%v23,116(%%r1,%2),2 \n\t" "vlef %%v22,120(%%r1,%2),3 \n\t" "vlef %%v23,124(%%r1,%2),3 \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vlef %%v24,128(%%r1,%2),0 \n\t" + "vlef %%v25,132(%%r1,%2),0 \n\t" + "vlef %%v24,136(%%r1,%2),1 \n\t" + "vlef %%v25,140(%%r1,%2),1 \n\t" + "vlef %%v24,144(%%r1,%2),2 \n\t" + "vlef %%v25,148(%%r1,%2),2 \n\t" + "vlef %%v24,152(%%r1,%2),3 \n\t" + "vlef %%v25,156(%%r1,%2),3 \n\t" + + "vlef %%v26,160(%%r1,%2),0 \n\t" + "vlef %%v27,164(%%r1,%2),0 \n\t" + "vlef %%v26,168(%%r1,%2),1 \n\t" + "vlef %%v27,172(%%r1,%2),1 \n\t" + "vlef %%v26,176(%%r1,%2),2 \n\t" + "vlef %%v27,180(%%r1,%2),2 \n\t" + "vlef %%v26,184(%%r1,%2),3 \n\t" + "vlef %%v27,188(%%r1,%2),3 \n\t" + + "vlef %%v28,192(%%r1,%2),0 \n\t" + "vlef %%v29,196(%%r1,%2),0 \n\t" + "vlef %%v28,200(%%r1,%2),1 \n\t" + "vlef %%v29,204(%%r1,%2),1 \n\t" + "vlef %%v28,208(%%r1,%2),2 \n\t" + "vlef %%v29,212(%%r1,%2),2 \n\t" + "vlef %%v28,216(%%r1,%2),3 \n\t" + "vlef %%v29,220(%%r1,%2),3 \n\t" + + "vlef %%v30,224(%%r1,%2),0 \n\t" + "vlef %%v31,228(%%r1,%2),0 \n\t" + "vlef %%v30,232(%%r1,%2),1 \n\t" + "vlef %%v31,236(%%r1,%2),1 \n\t" + "vlef %%v30,240(%%r1,%2),2 \n\t" + "vlef %%v31,244(%%r1,%2),2 \n\t" + "vlef %%v30,248(%%r1,%2),3 \n\t" + "vlef %%v31,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16,%%v16 \n\t" + "vflpsb %%v17,%%v17 \n\t" + "vflpsb %%v18,%%v18 \n\t" + "vflpsb %%v19,%%v19 \n\t" + "vflpsb %%v20,%%v20 \n\t" + "vflpsb %%v21,%%v21 \n\t" + "vflpsb %%v22,%%v22 \n\t" + "vflpsb %%v23,%%v23 \n\t" + "vflpsb %%v24,%%v24 \n\t" + "vflpsb %%v25,%%v25 \n\t" + "vflpsb %%v26,%%v26 \n\t" + "vflpsb %%v27,%%v27 \n\t" + "vflpsb %%v28,%%v28 \n\t" + "vflpsb %%v29,%%v29 \n\t" + "vflpsb %%v30,%%v30 \n\t" + "vflpsb %%v31,%%v31 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" + "vfasb %%v18,%%v18,%%v19 \n\t" + "vfasb %%v20,%%v20,%%v21 \n\t" + "vfasb %%v22,%%v22,%%v23 \n\t" + "vfasb %%v24,%%v24,%%v25 \n\t" + "vfasb %%v26,%%v26,%%v27 \n\t" + "vfasb %%v28,%%v28,%%v29 \n\t" + "vfasb %%v30,%%v30,%%v31 \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfminsb %%v16,%%v16,%%v24,0 \n\t" + "vfminsb %%v18,%%v18,%%v26,0 \n\t" + "vfminsb %%v20,%%v20,%%v28,0 \n\t" + "vfminsb %%v22,%%v22,%%v30,0 \n\t" - "vfchsb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfminsb %%v16,%%v16,%%v20,0 \n\t" + "vfminsb %%v18,%%v18,%%v22,0 \n\t" - "vfchsb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfminsb %%v16,%%v16,%%v18,0 \n\t" - "vlef %%v16,128(%%r1,%2),0 \n\t" - "vlef %%v17,132(%%r1,%2),0 \n\t" - "vlef %%v16,136(%%r1,%2),0 \n\t" - "vlef %%v17,140(%%r1,%2),0 \n\t" - "vlef %%v16,144(%%r1,%2),2 \n\t" - "vlef %%v17,148(%%r1,%2),2 \n\t" - "vlef %%v16,152(%%r1,%2),3 \n\t" - "vlef %%v17,156(%%r1,%2),3 \n\t" - - "vlef %%v18,160(%%r1,%2),0 \n\t" - "vlef %%v19,164(%%r1,%2),0 \n\t" - "vlef %%v18,168(%%r1,%2),0 \n\t" - "vlef %%v19,172(%%r1,%2),0 \n\t" - "vlef %%v18,176(%%r1,%2),2 \n\t" - "vlef %%v19,180(%%r1,%2),2 \n\t" - "vlef %%v18,184(%%r1,%2),3 \n\t" - "vlef %%v19,188(%%r1,%2),3 \n\t" - - "vlef %%v20,192(%%r1,%2),0 \n\t" - "vlef %%v21,196(%%r1,%2),0 \n\t" - "vlef %%v20,200(%%r1,%2),0 \n\t" - "vlef %%v21,204(%%r1,%2),0 \n\t" - "vlef %%v20,208(%%r1,%2),2 \n\t" - "vlef %%v21,212(%%r1,%2),2 \n\t" - "vlef %%v20,216(%%r1,%2),3 \n\t" - "vlef %%v21,220(%%r1,%2),3 \n\t" - - "vlef %%v22,224(%%r1,%2),0 \n\t" - "vlef %%v23,228(%%r1,%2),0 \n\t" - "vlef %%v22,232(%%r1,%2),0 \n\t" - "vlef %%v23,236(%%r1,%2),0 \n\t" - "vlef %%v22,240(%%r1,%2),2 \n\t" - "vlef %%v23,244(%%r1,%2),2 \n\t" - "vlef %%v22,248(%%r1,%2),3 \n\t" - "vlef %%v23,252(%%r1,%2),3 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchsb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchsb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfminsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -233,11 +221,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { minf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 236d11c72..a3d63fe53 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -39,8 +39,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) FLOAT amax; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" @@ -54,79 +53,42 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v25,8 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,8 \n\t" + "vfmaxdb %%v19,%%v19,%%v27,8 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,8 \n\t" + "vfmaxdb %%v21,%%v21,%%v29,8 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,8 \n\t" + "vfmaxdb %%v23,%%v23,%%v31,8 \n\t" - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + "vfmaxdb %%v16,%%v16,%%v20,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v21,8 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,8 \n\t" + "vfmaxdb %%v19,%%v19,%%v23,8 \n\t" - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + "vfmaxdb %%v16,%%v16,%%v18,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v19,8 \n\t" - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxdb %%v0,%%v0,%%16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ldr %0,%%f0 " + "vrepg %%v16,%%v0,1 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,8 \n\t" + "lpdr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -168,11 +130,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index c2c63c6c5..738ed8710 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -39,11 +39,10 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) FLOAT amin; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%2) \n\t" @@ -54,79 +53,42 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + "vfmindb %%v16,%%v16,%%v24,8 \n\t" + "vfmindb %%v17,%%v17,%%v25,8 \n\t" + "vfmindb %%v18,%%v18,%%v26,8 \n\t" + "vfmindb %%v19,%%v19,%%v27,8 \n\t" + "vfmindb %%v20,%%v20,%%v28,8 \n\t" + "vfmindb %%v21,%%v21,%%v29,8 \n\t" + "vfmindb %%v22,%%v22,%%v30,8 \n\t" + "vfmindb %%v23,%%v23,%%v31,8 \n\t" - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + "vfmindb %%v16,%%v16,%%v20,8 \n\t" + "vfmindb %%v17,%%v17,%%v21,8 \n\t" + "vfmindb %%v18,%%v18,%%v22,8 \n\t" + "vfmindb %%v19,%%v19,%%v23,8 \n\t" - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + "vfmindb %%v16,%%v16,%%v18,8 \n\t" + "vfmindb %%v17,%%v17,%%v19,8 \n\t" - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmindb %%v16,%%v16,%%v17,8 \n\t" - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmindb %%v0,%%v0,%%16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ldr %0,%%f0 " + "vrepg %%v16,%%v0,1 \n\t" + "wfmindb %%v0,%%v0,%%v16,8 \n\t" + "lpdr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -168,11 +130,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index 469f65735..aa8b932f9 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -32,7 +32,7 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) FLOAT max; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" @@ -46,62 +46,41 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v25,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxdb %%v19,%%v19,%%v27,0 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxdb %%v21,%%v21,%%v29,0 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" + "vfmaxdb %%v23,%%v23,%%v31,0 \n\t" - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v21,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" + "vfmaxdb %%v19,%%v19,%%v23,0 \n\t" - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v19,0 \n\t" - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxdb %%v16,%%v16,%%v17,0 \n\t" - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxdb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepg %%v16,%%v0,1 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(max) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -144,11 +123,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 3df504950..8ae5fe868 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -32,7 +32,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) FLOAT min; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" @@ -46,62 +46,41 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + "vfmindb %%v16,%%v16,%%v24,0 \n\t" + "vfmindb %%v17,%%v17,%%v25,0 \n\t" + "vfmindb %%v18,%%v18,%%v26,0 \n\t" + "vfmindb %%v19,%%v19,%%v27,0 \n\t" + "vfmindb %%v20,%%v20,%%v28,0 \n\t" + "vfmindb %%v21,%%v21,%%v29,0 \n\t" + "vfmindb %%v22,%%v22,%%v30,0 \n\t" + "vfmindb %%v23,%%v23,%%v31,0 \n\t" - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + "vfmindb %%v16,%%v16,%%v20,0 \n\t" + "vfmindb %%v17,%%v17,%%v21,0 \n\t" + "vfmindb %%v18,%%v18,%%v22,0 \n\t" + "vfmindb %%v19,%%v19,%%v23,0 \n\t" - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + "vfmindb %%v16,%%v16,%%v18,0 \n\t" + "vfmindb %%v17,%%v17,%%v19,0 \n\t" - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmindb %%v16,%%v16,%%v17,0 \n\t" - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmindb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepg %%v16,%%v0,1 \n\t" + "wfmindb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(min) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -144,11 +123,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 9b4077c6b..27f969eee 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -76,7 +76,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" - "pfd 1, 1024(%3) \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" "vlef %%v16,0(%%r1,%3),0 \n\t" "vlef %%v17,4(%%r1,%3),0 \n\t" @@ -127,14 +127,14 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -142,13 +142,13 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vlef %%v16,128(%%r1,%3),0 \n\t" @@ -200,14 +200,14 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -215,13 +215,13 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -250,8 +250,8 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -302,6 +302,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 6e952a325..ae7b37b4f 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -127,14 +127,14 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -142,13 +142,13 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vlef %%v16,128(%%r1,%3),0 \n\t" @@ -200,14 +200,14 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -215,13 +215,13 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -250,8 +250,8 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -302,6 +302,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 4f7ff6985..e5a1d3a7c 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -63,7 +63,7 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -83,10 +83,10 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -96,21 +96,21 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -130,10 +130,10 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -143,21 +143,21 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -175,8 +175,8 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -221,12 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 3abc7a558..a68f7282f 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -63,7 +63,7 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -83,10 +83,10 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -96,21 +96,21 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -130,10 +130,10 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -143,21 +143,21 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -175,8 +175,8 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchdb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -221,12 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 313a88db4..4c3040779 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -55,7 +55,7 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -67,10 +67,10 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -80,21 +80,21 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -106,10 +106,10 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -119,21 +119,21 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -151,8 +151,8 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imax),"=m"(*max) @@ -197,12 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 42443215b..ba1776a49 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -55,7 +55,7 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -67,10 +67,10 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -80,21 +80,21 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -106,10 +106,10 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -119,21 +119,21 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -151,8 +151,8 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "wfchdb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imin),"=m"(*min) @@ -197,12 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index dd2144db2..2f5c1c867 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -81,7 +81,7 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -101,10 +101,10 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -114,14 +114,14 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -129,13 +129,13 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -155,10 +155,10 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -168,14 +168,14 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -183,13 +183,13 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -218,8 +218,8 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -264,12 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index d7e44421d..04e05aad9 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -81,7 +81,7 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -101,10 +101,10 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -114,14 +114,14 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -129,13 +129,13 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -155,10 +155,10 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -168,14 +168,14 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -183,13 +183,13 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -218,8 +218,8 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -264,12 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 1ebc6c8c8..084b4ce94 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -73,7 +73,7 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -85,10 +85,10 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -98,14 +98,14 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -113,13 +113,13 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -131,10 +131,10 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -144,14 +144,14 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -159,13 +159,13 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -194,8 +194,8 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imax),"=m"(*max) @@ -240,12 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index a6b9d59de..4e85816a3 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -73,7 +73,7 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -85,10 +85,10 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -98,14 +98,14 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -113,13 +113,13 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -131,10 +131,10 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -144,14 +144,14 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -159,13 +159,13 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -194,8 +194,8 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imin),"=m"(*min) @@ -240,12 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 541464b05..2ffad2570 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -93,21 +93,21 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vleg %%v16,128(%%r1,%3),0 \n\t" @@ -139,21 +139,21 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -171,8 +171,8 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -223,6 +223,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 4b5572b80..1e037c0c7 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -93,21 +93,21 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vleg %%v16,128(%%r1,%3),0 \n\t" @@ -139,21 +139,21 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -171,8 +171,8 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchdb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -223,6 +223,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index 61d50159f..c8d831d06 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -40,8 +40,7 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -54,83 +53,45 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + "vfmaxsb %%v16,%%v16,%%v24,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v25,8 \n\t" + "vfmaxsb %%v18,%%v18,%%v26,8 \n\t" + "vfmaxsb %%v19,%%v19,%%v27,8 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,8 \n\t" + "vfmaxsb %%v21,%%v21,%%v29,8 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,8 \n\t" + "vfmaxsb %%v23,%%v23,%%v31,8 \n\t" - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + "vfmaxsb %%v16,%%v16,%%v20,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v21,8 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,8 \n\t" + "vfmaxsb %%v19,%%v19,%%v23,8 \n\t" - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + "vfmaxsb %%v16,%%v16,%%v18,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v19,8 \n\t" - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxsb %%v0,%%v0,%%16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ler %0,%%f0 " + "vrepf %%v16,%%v0,2 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,8 \n\t" + "lper %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -172,11 +133,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index a585a79ff..dd24c74d7 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -40,8 +40,7 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -54,83 +53,45 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + "vfminsb %%v16,%%v16,%%v24,8 \n\t" + "vfminsb %%v17,%%v17,%%v25,8 \n\t" + "vfminsb %%v18,%%v18,%%v26,8 \n\t" + "vfminsb %%v19,%%v19,%%v27,8 \n\t" + "vfminsb %%v20,%%v20,%%v28,8 \n\t" + "vfminsb %%v21,%%v21,%%v29,8 \n\t" + "vfminsb %%v22,%%v22,%%v30,8 \n\t" + "vfminsb %%v23,%%v23,%%v31,8 \n\t" - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + "vfminsb %%v16,%%v16,%%v20,8 \n\t" + "vfminsb %%v17,%%v17,%%v21,8 \n\t" + "vfminsb %%v18,%%v18,%%v22,8 \n\t" + "vfminsb %%v19,%%v19,%%v23,8 \n\t" - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + "vfminsb %%v16,%%v16,%%v18,8 \n\t" + "vfminsb %%v17,%%v17,%%v19,8 \n\t" - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfminsb %%v16,%%v16,%%v17,8 \n\t" - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfminsb %%v0,%%v0,%%16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfminsb %%v0,%%v0,%%v16,8 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ler %0,%%f0 " + "vrepf %%v16,%%v0,2 \n\t" + "wfminsb %%v0,%%v0,%%v16,8 \n\t" + "lper %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -172,11 +133,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index bcdb473af..8a2b86dc1 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -33,7 +33,7 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -46,66 +46,44 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxsb %%v17,%%v17,%%v25,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxsb %%v19,%%v19,%%v27,0 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxsb %%v21,%%v21,%%v29,0 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" + "vfmaxsb %%v23,%%v23,%%v31,0 \n\t" - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxsb %%v17,%%v17,%%v21,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" + "vfmaxsb %%v19,%%v19,%%v23,0 \n\t" - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" + "vfmaxsb %%v17,%%v17,%%v19,0 \n\t" - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxsb %%v16,%%v16,%%v17,0 \n\t" - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxsb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepf %%v16,%%v0,2 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(max) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -148,11 +126,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index 91c31d284..b87ec0fe8 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -33,7 +33,7 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -46,66 +46,44 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + "vfminsb %%v16,%%v16,%%v24,0 \n\t" + "vfminsb %%v17,%%v17,%%v25,0 \n\t" + "vfminsb %%v18,%%v18,%%v26,0 \n\t" + "vfminsb %%v19,%%v19,%%v27,0 \n\t" + "vfminsb %%v20,%%v20,%%v28,0 \n\t" + "vfminsb %%v21,%%v21,%%v29,0 \n\t" + "vfminsb %%v22,%%v22,%%v30,0 \n\t" + "vfminsb %%v23,%%v23,%%v31,0 \n\t" - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + "vfminsb %%v16,%%v16,%%v20,0 \n\t" + "vfminsb %%v17,%%v17,%%v21,0 \n\t" + "vfminsb %%v18,%%v18,%%v22,0 \n\t" + "vfminsb %%v19,%%v19,%%v23,0 \n\t" - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + "vfminsb %%v16,%%v16,%%v18,0 \n\t" + "vfminsb %%v17,%%v17,%%v19,0 \n\t" - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfminsb %%v16,%%v16,%%v17,0 \n\t" - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfminsb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepf %%v16,%%v0,2 \n\t" + "wfminsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(min) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -148,11 +126,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 8ef3f42ca..8175874c0 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -69,76 +69,66 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) "vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vleg %%v24,128(%%r1,%2),0 \n\t" + "vleg %%v25,136(%%r1,%2),0 \n\t" + "vleg %%v24,144(%%r1,%2),1 \n\t" + "vleg %%v25,152(%%r1,%2),1 \n\t" + "vleg %%v26,160(%%r1,%2),0 \n\t" + "vleg %%v27,168(%%r1,%2),0 \n\t" + "vleg %%v26,176(%%r1,%2),1 \n\t" + "vleg %%v27,184(%%r1,%2),1 \n\t" + "vleg %%v28,192(%%r1,%2),0 \n\t" + "vleg %%v29,200(%%r1,%2),0 \n\t" + "vleg %%v28,208(%%r1,%2),1 \n\t" + "vleg %%v29,216(%%r1,%2),1 \n\t" + "vleg %%v30,224(%%r1,%2),0 \n\t" + "vleg %%v31,232(%%r1,%2),0 \n\t" + "vleg %%v30,240(%%r1,%2),1 \n\t" + "vleg %%v31,248(%%r1,%2),1 \n\t" + + "vflpdb %%v16,%%v16 \n\t" + "vflpdb %%v17,%%v17 \n\t" + "vflpdb %%v18,%%v18 \n\t" + "vflpdb %%v19,%%v19 \n\t" + "vflpdb %%v20,%%v20 \n\t" + "vflpdb %%v21,%%v21 \n\t" + "vflpdb %%v22,%%v22 \n\t" + "vflpdb %%v23,%%v23 \n\t" + "vflpdb %%v24,%%v24 \n\t" + "vflpdb %%v25,%%v25 \n\t" + "vflpdb %%v26,%%v26 \n\t" + "vflpdb %%v27,%%v27 \n\t" + "vflpdb %%v28,%%v28 \n\t" + "vflpdb %%v29,%%v29 \n\t" + "vflpdb %%v30,%%v30 \n\t" + "vflpdb %%v31,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" + "vfadb %%v18,%%v18,%%v19 \n\t" + "vfadb %%v20,%%v20,%%v21 \n\t" + "vfadb %%v22,%%v22,%%v23 \n\t" + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg %%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -185,11 +175,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { maxf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 30fd1d030..5d57ff12e 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -69,76 +69,66 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) "vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vleg %%v24,128(%%r1,%2),0 \n\t" + "vleg %%v25,136(%%r1,%2),0 \n\t" + "vleg %%v24,144(%%r1,%2),1 \n\t" + "vleg %%v25,152(%%r1,%2),1 \n\t" + "vleg %%v26,160(%%r1,%2),0 \n\t" + "vleg %%v27,168(%%r1,%2),0 \n\t" + "vleg %%v26,176(%%r1,%2),1 \n\t" + "vleg %%v27,184(%%r1,%2),1 \n\t" + "vleg %%v28,192(%%r1,%2),0 \n\t" + "vleg %%v29,200(%%r1,%2),0 \n\t" + "vleg %%v28,208(%%r1,%2),1 \n\t" + "vleg %%v29,216(%%r1,%2),1 \n\t" + "vleg %%v30,224(%%r1,%2),0 \n\t" + "vleg %%v31,232(%%r1,%2),0 \n\t" + "vleg %%v30,240(%%r1,%2),1 \n\t" + "vleg %%v31,248(%%r1,%2),1 \n\t" + + "vflpdb %%v16,%%v16 \n\t" + "vflpdb %%v17,%%v17 \n\t" + "vflpdb %%v18,%%v18 \n\t" + "vflpdb %%v19,%%v19 \n\t" + "vflpdb %%v20,%%v20 \n\t" + "vflpdb %%v21,%%v21 \n\t" + "vflpdb %%v22,%%v22 \n\t" + "vflpdb %%v23,%%v23 \n\t" + "vflpdb %%v24,%%v24 \n\t" + "vflpdb %%v25,%%v25 \n\t" + "vflpdb %%v26,%%v26 \n\t" + "vflpdb %%v27,%%v27 \n\t" + "vflpdb %%v28,%%v28 \n\t" + "vflpdb %%v29,%%v29 \n\t" + "vflpdb %%v30,%%v30 \n\t" + "vflpdb %%v31,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" + "vfadb %%v18,%%v18,%%v19 \n\t" + "vfadb %%v20,%%v20,%%v21 \n\t" + "vfadb %%v22,%%v22,%%v23 \n\t" + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmindb %%v16,%%v16,%%v24,0 \n\t" + "vfmindb %%v18,%%v18,%%v26,0 \n\t" + "vfmindb %%v20,%%v20,%%v28,0 \n\t" + "vfmindb %%v22,%%v22,%%v30,0 \n\t" - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmindb %%v16,%%v16,%%v20,0 \n\t" + "vfmindb %%v18,%%v18,%%v22,0 \n\t" - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmindb %%v16,%%v16,%%v18,0 \n\t" - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg %%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmindb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmindb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -185,11 +175,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { minf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); From c8ef9fb22064dc6cb1c7515ad8d7e25c7adf9a8a Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:16:18 +0200 Subject: [PATCH 19/26] [ZARCH] Fix bug in iamax/iamin/imax/imin --- kernel/zarch/icamax.c | 1 + kernel/zarch/icamin.c | 1 + kernel/zarch/idamax.c | 1 + kernel/zarch/idamin.c | 1 + kernel/zarch/idmax.c | 1 + kernel/zarch/idmin.c | 1 + kernel/zarch/isamax.c | 1 + kernel/zarch/isamin.c | 1 + kernel/zarch/ismax.c | 1 + kernel/zarch/ismin.c | 1 + kernel/zarch/izamax.c | 1 + kernel/zarch/izamin.c | 1 + 12 files changed, 12 insertions(+) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 27f969eee..96cb37a1d 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -283,6 +283,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index ae7b37b4f..73bd9e8de 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -283,6 +283,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index e5a1d3a7c..4a0114242 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -206,6 +206,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index a68f7282f..503f92ff7 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -206,6 +206,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 4c3040779..871c896e6 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -182,6 +182,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index ba1776a49..dd14ec92c 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -182,6 +182,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 2f5c1c867..1a9ac3cd8 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -249,6 +249,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 04e05aad9..5a7e669eb 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -249,6 +249,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 084b4ce94..0b144c200 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -225,6 +225,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 4e85816a3..7fda9dffc 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -225,6 +225,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2ffad2570..7db64181c 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -204,6 +204,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 1e037c0c7..707d702d3 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -204,6 +204,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); ix += 2; i++; From 04873bb174d45a9cac478d7db7fd6f2618df2e81 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:32:24 +0200 Subject: [PATCH 20/26] [ZARCH] Undo the last commit --- kernel/zarch/icamax.c | 1 - kernel/zarch/icamin.c | 1 - kernel/zarch/idamax.c | 1 - kernel/zarch/idamin.c | 1 - kernel/zarch/idmax.c | 1 - kernel/zarch/idmin.c | 1 - kernel/zarch/isamax.c | 1 - kernel/zarch/isamin.c | 1 - kernel/zarch/ismax.c | 1 - kernel/zarch/ismin.c | 1 - kernel/zarch/izamax.c | 1 - kernel/zarch/izamin.c | 1 - 12 files changed, 12 deletions(-) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 96cb37a1d..27f969eee 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -283,7 +283,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 73bd9e8de..ae7b37b4f 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -283,7 +283,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - min = 0; minf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 4a0114242..e5a1d3a7c 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -206,7 +206,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 503f92ff7..a68f7282f 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -206,7 +206,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 871c896e6..4c3040779 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -182,7 +182,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index dd14ec92c..ba1776a49 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -182,7 +182,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 1a9ac3cd8..2f5c1c867 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -249,7 +249,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 5a7e669eb..04e05aad9 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -249,7 +249,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 0b144c200..084b4ce94 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -225,7 +225,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 7fda9dffc..4e85816a3 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -225,7 +225,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 7db64181c..2ffad2570 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -204,7 +204,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 707d702d3..1e037c0c7 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -204,7 +204,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - min = 0; minf = CABS1(x,0); ix += 2; i++; From c7143c1019d7a35f94454e2ac811cd948a41d22e Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:52:23 +0200 Subject: [PATCH 21/26] [ZARCH] Fix iamax/imax single precision --- kernel/zarch/icamax.c | 2 ++ kernel/zarch/icamin.c | 2 ++ kernel/zarch/isamax.c | 2 ++ kernel/zarch/isamin.c | 2 ++ kernel/zarch/ismax.c | 2 ++ kernel/zarch/ismin.c | 2 ++ 6 files changed, 12 insertions(+) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 27f969eee..2d1442ad9 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -248,6 +248,8 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index ae7b37b4f..79aa6d341 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -248,6 +248,8 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 2f5c1c867..6e0aaa162 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -216,6 +216,8 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 04e05aad9..266c48f7f 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -216,6 +216,8 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 084b4ce94..c968ce6fa 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -192,6 +192,8 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 4e85816a3..0145b31b3 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -192,6 +192,8 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" From dc4d3bccd5ee7de7bb823aa0bb7008a04bcc21d4 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 29 Jan 2019 03:47:49 +0200 Subject: [PATCH 22/26] [ZARCH] Fix icamax/icamin --- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 2d1442ad9..113c0cef5 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -94,7 +94,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vlef %%v18,48(%%r1,%3),2 \n\t" "vlef %%v19,52(%%r1,%3),2 \n\t" "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,30(%%r1,%3),3 \n\t" + "vlef %%v19,60(%%r1,%3),3 \n\t" "vlef %%v20,64(%%r1,%3),0 \n\t" "vlef %%v21,68(%%r1,%3),0 \n\t" diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 79aa6d341..5096b641b 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -94,7 +94,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vlef %%v18,48(%%r1,%3),2 \n\t" "vlef %%v19,52(%%r1,%3),2 \n\t" "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,30(%%r1,%3),3 \n\t" + "vlef %%v19,60(%%r1,%3),3 \n\t" "vlef %%v20,64(%%r1,%3),0 \n\t" "vlef %%v21,68(%%r1,%3),0 \n\t" From fcd814a8d292b7712a4230d9b9a20f0f2ce0fe52 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 29 Jan 2019 17:59:38 +0200 Subject: [PATCH 23/26] [ZARCH] Fix bug in max/min functions --- kernel/zarch/camax.c | 2 +- kernel/zarch/camin.c | 2 +- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- kernel/zarch/izamax.c | 2 +- kernel/zarch/izamin.c | 2 +- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 66d250896..f6fa772ac 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -198,7 +198,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { maxf = camax_kernel_32(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 5abc685b2..4bd6ca17d 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -198,7 +198,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { minf = camin_kernel_32(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 113c0cef5..a9e7f91fc 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -280,7 +280,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { max = icamax_kernel_32(n1, x, &maxf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 5096b641b..faf5f9c65 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -280,7 +280,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { min = icamin_kernel_32(n1, x, &minf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2ffad2570..2d1cc2365 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -199,7 +199,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { max = izamax_kernel_16(n1, x, &maxf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 1e037c0c7..676fd7c6d 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -199,7 +199,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { min = izamin_kernel_16(n1, x, &minf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 8175874c0..b7214783f 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { maxf = zamax_kernel_16(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 5d57ff12e..d53fdb6b8 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { minf = zamin_kernel_16(n1, x); - + ix = n1 * 2; i = n1; } else From eaf20f0e7ac8c2ab53deeb78f959bebb2a49cddd Mon Sep 17 00:00:00 2001 From: maamountki Date: Thu, 31 Jan 2019 09:26:50 +0200 Subject: [PATCH 24/26] Remove ztest --- ztest/Makefile | 437 ---------------------------------- ztest/amax.c | 235 ------------------ ztest/amin.c | 235 ------------------ ztest/asum.c | 263 -------------------- ztest/axpy.c | 303 ----------------------- ztest/copy.c | 291 ----------------------- ztest/dot.c | 296 ----------------------- ztest/dsdot.c | 229 ------------------ ztest/gemv.c | 633 ------------------------------------------------- ztest/iamax.c | 284 ---------------------- ztest/iamin.c | 284 ---------------------- ztest/imax.c | 231 ------------------ ztest/imin.c | 231 ------------------ ztest/max.c | 229 ------------------ ztest/min.c | 229 ------------------ ztest/rot.c | 303 ----------------------- ztest/scal.c | 308 ------------------------ ztest/swap.c | 306 ------------------------ 18 files changed, 5327 deletions(-) delete mode 100644 ztest/Makefile delete mode 100644 ztest/amax.c delete mode 100644 ztest/amin.c delete mode 100644 ztest/asum.c delete mode 100644 ztest/axpy.c delete mode 100644 ztest/copy.c delete mode 100644 ztest/dot.c delete mode 100644 ztest/dsdot.c delete mode 100644 ztest/gemv.c delete mode 100644 ztest/iamax.c delete mode 100644 ztest/iamin.c delete mode 100644 ztest/imax.c delete mode 100644 ztest/imin.c delete mode 100644 ztest/max.c delete mode 100644 ztest/min.c delete mode 100644 ztest/rot.c delete mode 100644 ztest/scal.c delete mode 100644 ztest/swap.c diff --git a/ztest/Makefile b/ztest/Makefile deleted file mode 100644 index 0ff7fe46a..000000000 --- a/ztest/Makefile +++ /dev/null @@ -1,437 +0,0 @@ -TOPDIR = .. -include $(TOPDIR)/Makefile.system - -goto :: sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto sswap.goto dswap.goto cswap.goto zswap.goto isamax.goto idamax.goto icamax.goto izamax.goto samax.goto damax.goto ismax.goto idmax.goto smax.goto dmax.goto isamin.goto idamin.goto icamin.goto izamin.goto samin.goto damin.goto camin.goto zamin.goto ismin.goto idmin.goto smin.goto dmin.goto sgemv.goto dgemv.goto cgemv.goto zgemv.goto sscal.goto dscal.goto cscal.goto zscal.goto saxpy.goto daxpy.goto caxpy.goto zaxpy.goto srot.goto drot.goto crot.goto zrot.goto sasum.goto dasum.goto casum.goto zasum.goto scopy.goto dcopy.goto ccopy.goto zcopy.goto - -##################################### Sdot #################################################### -sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Ddot #################################################### -ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cdot #################################################### -cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zdot #################################################### -zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dsdot #################################################### -dsdot.goto : dsdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMAX ############################################## -isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMAX ############################################## -idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMAX ############################################## -icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMAX ############################################## -izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMAX ############################################## -samax.goto : samax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMAX ############################################## -damax.goto : damax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMAX ############################################## -ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMAX ############################################## -idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMAX ############################################## -smax.goto : smax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMAX ############################################## -dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMIN ############################################## -isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMIN ############################################## -idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMIN ############################################## -icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMIN ############################################## -izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMIN ############################################## -samin.goto : samin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMIN ############################################## -damin.goto : damin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMIN ############################################## -ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMIN ############################################## -idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMIN ############################################## -smin.goto : smin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMIN ############################################## -dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sgemv #################################################### -sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dgemv #################################################### -dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cgemv #################################################### - -cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zgemv #################################################### - -zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sscal #################################################### -sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dscal #################################################### -dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cscal #################################################### - -cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zscal #################################################### - -zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Srot #################################################### -srot.goto : srot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Drot #################################################### -drot.goto : drot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Crot #################################################### -crot.goto : crot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zrot #################################################### -zrot.goto : zrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sswap #################################################### -sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dswap #################################################### -dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cswap #################################################### - -cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zswap #################################################### - -zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sasum #################################################### -sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dasum #################################################### -dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Casum #################################################### - -casum.goto : casum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zasum #################################################### - -zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Scopy #################################################### -scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dcopy #################################################### -dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Ccopy #################################################### - -ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zcopy #################################################### - -zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -################################################################################################### - -sdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -ddot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -dsdot.$(SUFFIX) : dsdot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -isamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -samax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ismax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -smax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -isamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -samin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -camin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zamin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ismin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -smin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -drot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -crot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -casum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -clean :: - @rm -f *.goto - diff --git a/ztest/amax.c b/ztest/amax.c deleted file mode 100644 index f2e3f5411..000000000 --- a/ztest/amax.c +++ /dev/null @@ -1,235 +0,0 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - - if (n <= 0 || inc_x <= 0) return(maxf); - - maxf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) > maxf ) - { - maxf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(maxf); -} - -#undef AMAX -#ifdef DOUBLE -#define AMAX BLASFUNC(damax) -#else -#define AMAX BLASFUNC(samax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - - if (n <= 0 || inc_x <= 0) return(minf); - - minf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) < minf ) - { - minf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(minf); -} - -#undef AMIN -#ifdef DOUBLE -#define AMIN BLASFUNC(damin) -#else -#define AMIN BLASFUNC(samin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - FLOAT sumf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(sumf); - - inc_x2 = 2 * inc_x; - - n *= inc_x2; - while(i < n) - { - sumf += CABS1(x,i); - i += inc_x2; - } - return(sumf); -} -#else -FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - FLOAT sumf = 0.0; - if (n <= 0 || inc_x <= 0) return(sumf); - - n *= inc_x; - while(i < n) - { - sumf += ABS(x[i]); - i += inc_x; - } - return(sumf); -} -#endif - -#undef ASUM -#ifdef COMPLEX -#ifdef DOUBLE -#define ASUM BLASFUNC(dzasum) -#else -#define ASUM BLASFUNC(scasum) -#endif -#else -#ifdef DOUBLE -#define ASUM BLASFUNC(dasum) -#else -#define ASUM BLASFUNC(sasum) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix,iy; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - if ( da_r == 0.0 && da_i == 0.0 ) return(0); - - ix = 0; - iy = 0; - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { -#if !defined(CONJ) - y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; - y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; -#else - y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; - y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; -#endif - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); - -} -#else -int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix,iy; - - if ( n < 0 ) return(0); - if ( da == 0.0 ) return(0); - - ix = 0; - iy = 0; - - while(i < n) - { - - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); - -} -#endif - -#undef AXPY -#ifdef COMPLEX -#ifdef DOUBLE -#define AXPY BLASFUNC(zaxpy) -#else -#define AXPY BLASFUNC(caxpy) -#endif -#else -#ifdef DOUBLE -#define AXPY BLASFUNC(daxpy) -#else -#define AXPY BLASFUNC(saxpy) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *y_c;; - FLOAT alpha[2] = { 2.0, 2.0 }; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - argc--;argv++; - - blasint iy; - int test = 1; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { - - y[iy] = x[ix] ; - y[iy+1] = x[ix+1] ; - ix += inc_x2; - iy += inc_y2; - i++ ; - - } - return(0); - -} -#else -int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - if ( n < 0 ) return(0); - - while(i < n) - { - - y[iy] = x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); - -} -#endif - -#undef COPY -#ifdef COMPLEX -#ifdef DOUBLE -#define COPY BLASFUNC(zcopy) -#else -#define COPY BLASFUNC(ccopy) -#endif -#else -#ifdef DOUBLE -#define COPY BLASFUNC(dcopy) -#else -#define COPY BLASFUNC(scopy) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *y_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT dot[2]; - OPENBLAS_COMPLEX_FLOAT result; - BLASLONG inc_x2; - BLASLONG inc_y2; - - dot[0]=0.0; - dot[1]=0.0; - - CREAL(result) = 0.0 ; - CIMAG(result) = 0.0 ; - - if ( n < 1 ) return(result); - - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - - while(i < n) - { -#if !defined(CONJ) - dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; - dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; -#else - dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; - dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; -#endif - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - CREAL(result) = dot[0]; - CIMAG(result) = dot[1]; - return(result); - -} -#else -FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT dot = 0.0 ; - - if ( n < 0 ) return(dot); - - while(i < n) - { - - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); -} -#endif - -#undef DOT -#ifdef COMPLEX -#ifdef DOUBLE -#define DOT BLASFUNC(zdotu) -#else -#define DOT BLASFUNC(cdotu) -#endif -#else -#ifdef DOUBLE -#define DOT BLASFUNC(ddot) -#else -#define DOT BLASFUNC(sdot) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y; -#ifdef COMPLEX - OPENBLAS_COMPLEX_FLOAT result, result_c; -#else - FLOAT result, result_c; -#endif - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - double dot = 0.0 ; - - if ( n < 0 ) return(dot); - - while(i < n) - { - - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); -} - -#undef DSDOT -#define DSDOT BLASFUNC(dsdot) - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y; - double result, result_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i; - BLASLONG ix, iy; - BLASLONG j; - FLOAT *a_ptr; - FLOAT temp_r, temp_i; - BLASLONG inc_x2, inc_y2; - BLASLONG lda2; - BLASLONG i2; - - lda2 = 2 * lda; - - ix = 0; - a_ptr = a; - - if (inc_x == 1 && inc_y == 1) - { - - for (j = 0; jtv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y, *y_c; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 0.0}; - char trans='N'; - blasint m, i, j; - blasint inc_x=1,inc_y=1; - blasint n=0; - int has_param_n = 0; - int has_param_m = 0; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint y_size; - blasint iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - - int tomax = to; - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; - if ((p = getenv("OPENBLAS_PARAM_N"))) { - n = atoi(p); - if ((n>0)) has_param_n = 1; - if ( n > tomax ) tomax = n; - } - if ( has_param_n == 0 ) - if ((p = getenv("OPENBLAS_PARAM_M"))) { - m = atoi(p); - if ((m>0)) has_param_m = 1; - if ( m > tomax ) tomax = m; - } - - - - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - if (has_param_m == 0) - { - - for(m = from; m <= to; m += step) - { - timeg=0; - timeg_c=0; - if ( has_param_n == 0 ) n = m; - fprintf(stderr, " %6dx%d :", (int)m,(int)n); - for(j = 0; j < m; j++){ - for(i = 0; i < n * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf; - BLASLONG max=0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(max); - - inc_x2 = 2 * inc_x; - - maxf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return(max+1); -} -#else -BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - BLASLONG max=0; - - if (n <= 0 || inc_x <= 0) return(max); - - maxf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) > maxf ) - { - max = i; - maxf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(max+1); -} -#endif - -#undef IAMAX -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMAX BLASFUNC(izamax) -#else -#define IAMAX BLASFUNC(icamax) -#endif -#else -#ifdef DOUBLE -#define IAMAX BLASFUNC(idamax) -#else -#define IAMAX BLASFUNC(isamax) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf; - BLASLONG min=0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(min); - - inc_x2 = 2 * inc_x; - - minf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return(min+1); -} -#else -BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - BLASLONG min=0; - - if (n <= 0 || inc_x <= 0) return(min); - - minf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) < minf ) - { - min = i; - minf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(min+1); -} -#endif - -#undef IAMIN -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMIN BLASFUNC(izamin) -#else -#define IAMIN BLASFUNC(icamin) -#endif -#else -#ifdef DOUBLE -#define IAMIN BLASFUNC(idamin) -#else -#define IAMIN BLASFUNC(isamin) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - BLASLONG max=0; - - if (n <= 0 || inc_x <= 0) return(max); - - maxf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] > maxf ) - { - max = i; - maxf = x[ix]; - } - ix += inc_x; - i++; - } - return(max+1); -} - -#undef IMAX -#ifdef DOUBLE -#define IMAX BLASFUNC(idmax) -#else -#define IMAX BLASFUNC(ismax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - BLASLONG min=0; - - if (n <= 0 || inc_x <= 0) return(min); - - minf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] < minf ) - { - min = i; - minf = x[ix]; - } - ix += inc_x; - i++; - } - return(min+1); -} - -#undef IMIN -#ifdef DOUBLE -#define IMIN BLASFUNC(idmin) -#else -#define IMIN BLASFUNC(ismin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - - if (n <= 0 || inc_x <= 0) return(maxf); - - maxf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] > maxf ) - { - maxf = x[ix]; - } - ix += inc_x; - i++; - } - return(maxf); -} - -#undef MAX_ -#ifdef DOUBLE -#define MAX_ BLASFUNC(dmax) -#else -#define MAX_ BLASFUNC(smax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - - if (n <= 0 || inc_x <= 0) return(minf); - - minf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] < minf ) - { - minf = x[ix]; - } - ix += inc_x; - i++; - } - return(minf); -} - -#undef MIN_ -#ifdef DOUBLE -#define MIN_ BLASFUNC(dmin) -#else -#define MIN_ BLASFUNC(smin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); -} -#else -int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); - - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; - - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); -} -#endif - -#undef ROT -#ifdef COMPLEX -#ifdef DOUBLE -#define ROT BLASFUNC(zdrot) -#else -#define ROT BLASFUNC(csrot) -#endif -#else -#ifdef DOUBLE -#define ROT BLASFUNC(drot) -#else -#define ROT BLASFUNC(srot) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *x_c, *y_c; - // FLOAT result; - blasint m, i; - blasint inc_x=1,inc_y=1; - FLOAT c[1] = { 2.0 }; - FLOAT s[1] = { 2.0 }; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix,iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG inc_x2; - BLASLONG ip = 0; - FLOAT temp; - - if ( (n <= 0) || (inc_x <= 0)) - return(0); - - inc_x2 = 2 * inc_x; - for ( i=0; itv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *x_c; - FLOAT alpha[2] = { 2.0, 2.0 }; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); -} -#else -int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n < 0 ) return(0); - - while(i < n) - { - - temp = x[ix] ; - x[ix] = y[iy] ; - y[iy] = temp ; - - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); -} -#endif - -#undef SWAP -#ifdef COMPLEX -#ifdef DOUBLE -#define SWAP BLASFUNC(zswap) -#else -#define SWAP BLASFUNC(cswap) -#endif -#else -#ifdef DOUBLE -#define SWAP BLASFUNC(dswap) -#else -#define SWAP BLASFUNC(sswap) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *x_c, *y_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix,iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l Date: Thu, 31 Jan 2019 18:52:11 +0200 Subject: [PATCH 25/26] [ZARCH] Improve loading performance for camax/icamax --- kernel/zarch/camax.c | 114 ++++++++++++++++++------------------------ kernel/zarch/camin.c | 114 ++++++++++++++++++------------------------ kernel/zarch/icamax.c | 114 ++++++++++++++++++------------------------ kernel/zarch/icamin.c | 114 ++++++++++++++++++------------------------ kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 6 files changed, 198 insertions(+), 262 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index f6fa772ac..2e9648640 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -52,82 +52,66 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "vflpsb %%v0,%%v0 \n\t" "vflpsb %%v16,%%v16 \n\t" "vfasb %%v0,%%v0,%%v16 \n\t" + "vleib %%v1,0,0 \n\t" + "vleib %%v1,1,1 \n\t" + "vleib %%v1,2,2 \n\t" + "vleib %%v1,3,3 \n\t" + "vleib %%v1,8,4 \n\t" + "vleib %%v1,9,5 \n\t" + "vleib %%v1,10,6 \n\t" + "vleib %%v1,11,7 \n\t" + "vleib %%v1,16,8 \n\t" + "vleib %%v1,17,9 \n\t" + "vleib %%v1,18,10 \n\t" + "vleib %%v1,19,11 \n\t" + "vleib %%v1,24,12 \n\t" + "vleib %%v1,25,13 \n\t" + "vleib %%v1,26,14 \n\t" + "vleib %%v1,27,15 \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),1 \n\t" - "vlef %%v17,12(%%r1,%2),1 \n\t" - "vlef %%v16,16(%%r1,%2),2 \n\t" - "vlef %%v17,20(%%r1,%2),2 \n\t" - "vlef %%v16,24(%%r1,%2),3 \n\t" - "vlef %%v17,28(%%r1,%2),3 \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v2,16(%%r1,%2) \n\t" + "vpkg %%v17,%%v16,%%v2 \n\t" + "vperm %%v16,%%v16,%%v2,%%v1 \n\t" - "vlef %%v18,32(%%r1,%2),0 \n\t" - "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),1 \n\t" - "vlef %%v19,44(%%r1,%2),1 \n\t" - "vlef %%v18,48(%%r1,%2),2 \n\t" - "vlef %%v19,52(%%r1,%2),2 \n\t" - "vlef %%v18,56(%%r1,%2),3 \n\t" - "vlef %%v19,30(%%r1,%2),3 \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v2,48(%%r1,%2) \n\t" + "vpkg %%v19,%%v18,%%v2 \n\t" + "vperm %%v18,%%v18,%%v2,%%v1 \n\t" - "vlef %%v20,64(%%r1,%2),0 \n\t" - "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),1 \n\t" - "vlef %%v21,76(%%r1,%2),1 \n\t" - "vlef %%v20,80(%%r1,%2),2 \n\t" - "vlef %%v21,84(%%r1,%2),2 \n\t" - "vlef %%v20,88(%%r1,%2),3 \n\t" - "vlef %%v21,92(%%r1,%2),3 \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v2,80(%%r1,%2) \n\t" + "vpkg %%v21,%%v20,%%v2 \n\t" + "vperm %%v20,%%v20,%%v2,%%v1 \n\t" - "vlef %%v22,96(%%r1,%2),0 \n\t" - "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),1 \n\t" - "vlef %%v23,108(%%r1,%2),1 \n\t" - "vlef %%v22,112(%%r1,%2),2 \n\t" - "vlef %%v23,116(%%r1,%2),2 \n\t" - "vlef %%v22,120(%%r1,%2),3 \n\t" - "vlef %%v23,124(%%r1,%2),3 \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v2,112(%%r1,%2) \n\t" + "vpkg %%v23,%%v22,%%v2 \n\t" + "vperm %%v22,%%v22,%%v2,%%v1 \n\t" - "vlef %%v24,128(%%r1,%2),0 \n\t" - "vlef %%v25,132(%%r1,%2),0 \n\t" - "vlef %%v24,136(%%r1,%2),1 \n\t" - "vlef %%v25,140(%%r1,%2),1 \n\t" - "vlef %%v24,144(%%r1,%2),2 \n\t" - "vlef %%v25,148(%%r1,%2),2 \n\t" - "vlef %%v24,152(%%r1,%2),3 \n\t" - "vlef %%v25,156(%%r1,%2),3 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v2,144(%%r1,%2) \n\t" + "vpkg %%v25,%%v24,%%v2 \n\t" + "vperm %%v24,%%v24,%%v2,%%v1 \n\t" - "vlef %%v26,160(%%r1,%2),0 \n\t" - "vlef %%v27,164(%%r1,%2),0 \n\t" - "vlef %%v26,168(%%r1,%2),1 \n\t" - "vlef %%v27,172(%%r1,%2),1 \n\t" - "vlef %%v26,176(%%r1,%2),2 \n\t" - "vlef %%v27,180(%%r1,%2),2 \n\t" - "vlef %%v26,184(%%r1,%2),3 \n\t" - "vlef %%v27,188(%%r1,%2),3 \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v2,176(%%r1,%2) \n\t" + "vpkg %%v27,%%v26,%%v2 \n\t" + "vperm %%v26,%%v26,%%v2,%%v1 \n\t" - "vlef %%v28,192(%%r1,%2),0 \n\t" - "vlef %%v29,196(%%r1,%2),0 \n\t" - "vlef %%v28,200(%%r1,%2),1 \n\t" - "vlef %%v29,204(%%r1,%2),1 \n\t" - "vlef %%v28,208(%%r1,%2),2 \n\t" - "vlef %%v29,212(%%r1,%2),2 \n\t" - "vlef %%v28,216(%%r1,%2),3 \n\t" - "vlef %%v29,220(%%r1,%2),3 \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v2,208(%%r1,%2) \n\t" + "vpkg %%v29,%%v28,%%v2 \n\t" + "vperm %%v28,%%v28,%%v2,%%v1 \n\t" - "vlef %%v30,224(%%r1,%2),0 \n\t" - "vlef %%v31,228(%%r1,%2),0 \n\t" - "vlef %%v30,232(%%r1,%2),1 \n\t" - "vlef %%v31,236(%%r1,%2),1 \n\t" - "vlef %%v30,240(%%r1,%2),2 \n\t" - "vlef %%v31,244(%%r1,%2),2 \n\t" - "vlef %%v30,248(%%r1,%2),3 \n\t" - "vlef %%v31,252(%%r1,%2),3 \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v2,240(%%r1,%2) \n\t" + "vpkg %%v31,%%v30,%%v2 \n\t" + "vperm %%v30,%%v30,%%v2,%%v1 \n\t" "vflpsb %%v16,%%v16 \n\t" "vflpsb %%v17,%%v17 \n\t" @@ -178,7 +162,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "ler %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amax; diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 4bd6ca17d..aec59058e 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -52,82 +52,66 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vflpsb %%v0,%%v0 \n\t" "vflpsb %%v16,%%v16 \n\t" "vfasb %%v0,%%v0,%%v16 \n\t" + "vleib %%v1,0,0 \n\t" + "vleib %%v1,1,1 \n\t" + "vleib %%v1,2,2 \n\t" + "vleib %%v1,3,3 \n\t" + "vleib %%v1,8,4 \n\t" + "vleib %%v1,9,5 \n\t" + "vleib %%v1,10,6 \n\t" + "vleib %%v1,11,7 \n\t" + "vleib %%v1,16,8 \n\t" + "vleib %%v1,17,9 \n\t" + "vleib %%v1,18,10 \n\t" + "vleib %%v1,19,11 \n\t" + "vleib %%v1,24,12 \n\t" + "vleib %%v1,25,13 \n\t" + "vleib %%v1,26,14 \n\t" + "vleib %%v1,27,15 \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),1 \n\t" - "vlef %%v17,12(%%r1,%2),1 \n\t" - "vlef %%v16,16(%%r1,%2),2 \n\t" - "vlef %%v17,20(%%r1,%2),2 \n\t" - "vlef %%v16,24(%%r1,%2),3 \n\t" - "vlef %%v17,28(%%r1,%2),3 \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v2,16(%%r1,%2) \n\t" + "vpkg %%v17,%%v16,%%v2 \n\t" + "vperm %%v16,%%v16,%%v2,%%v1 \n\t" - "vlef %%v18,32(%%r1,%2),0 \n\t" - "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),1 \n\t" - "vlef %%v19,44(%%r1,%2),1 \n\t" - "vlef %%v18,48(%%r1,%2),2 \n\t" - "vlef %%v19,52(%%r1,%2),2 \n\t" - "vlef %%v18,56(%%r1,%2),3 \n\t" - "vlef %%v19,30(%%r1,%2),3 \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v2,48(%%r1,%2) \n\t" + "vpkg %%v19,%%v18,%%v2 \n\t" + "vperm %%v18,%%v18,%%v2,%%v1 \n\t" - "vlef %%v20,64(%%r1,%2),0 \n\t" - "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),1 \n\t" - "vlef %%v21,76(%%r1,%2),1 \n\t" - "vlef %%v20,80(%%r1,%2),2 \n\t" - "vlef %%v21,84(%%r1,%2),2 \n\t" - "vlef %%v20,88(%%r1,%2),3 \n\t" - "vlef %%v21,92(%%r1,%2),3 \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v2,80(%%r1,%2) \n\t" + "vpkg %%v21,%%v20,%%v2 \n\t" + "vperm %%v20,%%v20,%%v2,%%v1 \n\t" - "vlef %%v22,96(%%r1,%2),0 \n\t" - "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),1 \n\t" - "vlef %%v23,108(%%r1,%2),1 \n\t" - "vlef %%v22,112(%%r1,%2),2 \n\t" - "vlef %%v23,116(%%r1,%2),2 \n\t" - "vlef %%v22,120(%%r1,%2),3 \n\t" - "vlef %%v23,124(%%r1,%2),3 \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v2,112(%%r1,%2) \n\t" + "vpkg %%v23,%%v22,%%v2 \n\t" + "vperm %%v22,%%v22,%%v2,%%v1 \n\t" - "vlef %%v24,128(%%r1,%2),0 \n\t" - "vlef %%v25,132(%%r1,%2),0 \n\t" - "vlef %%v24,136(%%r1,%2),1 \n\t" - "vlef %%v25,140(%%r1,%2),1 \n\t" - "vlef %%v24,144(%%r1,%2),2 \n\t" - "vlef %%v25,148(%%r1,%2),2 \n\t" - "vlef %%v24,152(%%r1,%2),3 \n\t" - "vlef %%v25,156(%%r1,%2),3 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v2,144(%%r1,%2) \n\t" + "vpkg %%v25,%%v24,%%v2 \n\t" + "vperm %%v24,%%v24,%%v2,%%v1 \n\t" - "vlef %%v26,160(%%r1,%2),0 \n\t" - "vlef %%v27,164(%%r1,%2),0 \n\t" - "vlef %%v26,168(%%r1,%2),1 \n\t" - "vlef %%v27,172(%%r1,%2),1 \n\t" - "vlef %%v26,176(%%r1,%2),2 \n\t" - "vlef %%v27,180(%%r1,%2),2 \n\t" - "vlef %%v26,184(%%r1,%2),3 \n\t" - "vlef %%v27,188(%%r1,%2),3 \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v2,176(%%r1,%2) \n\t" + "vpkg %%v27,%%v26,%%v2 \n\t" + "vperm %%v26,%%v26,%%v2,%%v1 \n\t" - "vlef %%v28,192(%%r1,%2),0 \n\t" - "vlef %%v29,196(%%r1,%2),0 \n\t" - "vlef %%v28,200(%%r1,%2),1 \n\t" - "vlef %%v29,204(%%r1,%2),1 \n\t" - "vlef %%v28,208(%%r1,%2),2 \n\t" - "vlef %%v29,212(%%r1,%2),2 \n\t" - "vlef %%v28,216(%%r1,%2),3 \n\t" - "vlef %%v29,220(%%r1,%2),3 \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v2,208(%%r1,%2) \n\t" + "vpkg %%v29,%%v28,%%v2 \n\t" + "vperm %%v28,%%v28,%%v2,%%v1 \n\t" - "vlef %%v30,224(%%r1,%2),0 \n\t" - "vlef %%v31,228(%%r1,%2),0 \n\t" - "vlef %%v30,232(%%r1,%2),1 \n\t" - "vlef %%v31,236(%%r1,%2),1 \n\t" - "vlef %%v30,240(%%r1,%2),2 \n\t" - "vlef %%v31,244(%%r1,%2),2 \n\t" - "vlef %%v30,248(%%r1,%2),3 \n\t" - "vlef %%v31,252(%%r1,%2),3 \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v2,240(%%r1,%2) \n\t" + "vpkg %%v31,%%v30,%%v2 \n\t" + "vperm %%v30,%%v30,%%v2,%%v1 \n\t" "vflpsb %%v16,%%v16 \n\t" "vflpsb %%v17,%%v17 \n\t" @@ -178,7 +162,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "ler %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amin; diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index a9e7f91fc..5129ca6ee 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -57,6 +57,22 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vleig %%v2,3,1 \n\t" "vrepig %%v3,16 \n\t" "vzero %%v4 \n\t" + "vleib %%v9,0,0 \n\t" + "vleib %%v9,1,1 \n\t" + "vleib %%v9,2,2 \n\t" + "vleib %%v9,3,3 \n\t" + "vleib %%v9,8,4 \n\t" + "vleib %%v9,9,5 \n\t" + "vleib %%v9,10,6 \n\t" + "vleib %%v9,11,7 \n\t" + "vleib %%v9,16,8 \n\t" + "vleib %%v9,17,9 \n\t" + "vleib %%v9,18,10 \n\t" + "vleib %%v9,19,11 \n\t" + "vleib %%v9,24,12 \n\t" + "vleib %%v9,25,13 \n\t" + "vleib %%v9,26,14 \n\t" + "vleib %%v9,27,15 \n\t" "vleif %%v24,0,0 \n\t" "vleif %%v24,1,1 \n\t" "vleif %%v24,2,2 \n\t" @@ -78,41 +94,25 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" - "vlef %%v16,0(%%r1,%3),0 \n\t" - "vlef %%v17,4(%%r1,%3),0 \n\t" - "vlef %%v16,8(%%r1,%3),1 \n\t" - "vlef %%v17,12(%%r1,%3),1 \n\t" - "vlef %%v16,16(%%r1,%3),2 \n\t" - "vlef %%v17,20(%%r1,%3),2 \n\t" - "vlef %%v16,24(%%r1,%3),3 \n\t" - "vlef %%v17,28(%%r1,%3),3 \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v28,16(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,32(%%r1,%3),0 \n\t" - "vlef %%v19,36(%%r1,%3),0 \n\t" - "vlef %%v18,40(%%r1,%3),1 \n\t" - "vlef %%v19,44(%%r1,%3),1 \n\t" - "vlef %%v18,48(%%r1,%3),2 \n\t" - "vlef %%v19,52(%%r1,%3),2 \n\t" - "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,60(%%r1,%3),3 \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v29,48(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,64(%%r1,%3),0 \n\t" - "vlef %%v21,68(%%r1,%3),0 \n\t" - "vlef %%v20,72(%%r1,%3),1 \n\t" - "vlef %%v21,76(%%r1,%3),1 \n\t" - "vlef %%v20,80(%%r1,%3),2 \n\t" - "vlef %%v21,84(%%r1,%3),2 \n\t" - "vlef %%v20,88(%%r1,%3),3 \n\t" - "vlef %%v21,92(%%r1,%3),3 \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,96(%%r1,%3),0 \n\t" - "vlef %%v23,100(%%r1,%3),0 \n\t" - "vlef %%v22,104(%%r1,%3),1 \n\t" - "vlef %%v23,108(%%r1,%3),1 \n\t" - "vlef %%v22,112(%%r1,%3),2 \n\t" - "vlef %%v23,116(%%r1,%3),2 \n\t" - "vlef %%v22,120(%%r1,%3),3 \n\t" - "vlef %%v23,124(%%r1,%3),3 \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -151,41 +151,25 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vlef %%v16,128(%%r1,%3),0 \n\t" - "vlef %%v17,132(%%r1,%3),0 \n\t" - "vlef %%v16,136(%%r1,%3),1 \n\t" - "vlef %%v17,140(%%r1,%3),1 \n\t" - "vlef %%v16,144(%%r1,%3),2 \n\t" - "vlef %%v17,148(%%r1,%3),2 \n\t" - "vlef %%v16,152(%%r1,%3),3 \n\t" - "vlef %%v17,156(%%r1,%3),3 \n\t" + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v28,144(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,160(%%r1,%3),0 \n\t" - "vlef %%v19,164(%%r1,%3),0 \n\t" - "vlef %%v18,168(%%r1,%3),1 \n\t" - "vlef %%v19,172(%%r1,%3),1 \n\t" - "vlef %%v18,176(%%r1,%3),2 \n\t" - "vlef %%v19,180(%%r1,%3),2 \n\t" - "vlef %%v18,184(%%r1,%3),3 \n\t" - "vlef %%v19,188(%%r1,%3),3 \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v29,176(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,192(%%r1,%3),0 \n\t" - "vlef %%v21,196(%%r1,%3),0 \n\t" - "vlef %%v20,200(%%r1,%3),1 \n\t" - "vlef %%v21,204(%%r1,%3),1 \n\t" - "vlef %%v20,208(%%r1,%3),2 \n\t" - "vlef %%v21,212(%%r1,%3),2 \n\t" - "vlef %%v20,216(%%r1,%3),3 \n\t" - "vlef %%v21,220(%%r1,%3),3 \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v30,208(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,224(%%r1,%3),0 \n\t" - "vlef %%v23,228(%%r1,%3),0 \n\t" - "vlef %%v22,232(%%r1,%3),1 \n\t" - "vlef %%v23,236(%%r1,%3),1 \n\t" - "vlef %%v22,240(%%r1,%3),2 \n\t" - "vlef %%v23,244(%%r1,%3),2 \n\t" - "vlef %%v22,248(%%r1,%3),3 \n\t" - "vlef %%v23,252(%%r1,%3),3 \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v31,240(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -258,7 +242,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "nop " :"=r"(iamax),"=m"(*amax) :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return iamax; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index faf5f9c65..05068b212 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -57,6 +57,22 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vleig %%v2,3,1 \n\t" "vrepig %%v3,16 \n\t" "vzero %%v4 \n\t" + "vleib %%v9,0,0 \n\t" + "vleib %%v9,1,1 \n\t" + "vleib %%v9,2,2 \n\t" + "vleib %%v9,3,3 \n\t" + "vleib %%v9,8,4 \n\t" + "vleib %%v9,9,5 \n\t" + "vleib %%v9,10,6 \n\t" + "vleib %%v9,11,7 \n\t" + "vleib %%v9,16,8 \n\t" + "vleib %%v9,17,9 \n\t" + "vleib %%v9,18,10 \n\t" + "vleib %%v9,19,11 \n\t" + "vleib %%v9,24,12 \n\t" + "vleib %%v9,25,13 \n\t" + "vleib %%v9,26,14 \n\t" + "vleib %%v9,27,15 \n\t" "vleif %%v24,0,0 \n\t" "vleif %%v24,1,1 \n\t" "vleif %%v24,2,2 \n\t" @@ -78,41 +94,25 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" - "vlef %%v16,0(%%r1,%3),0 \n\t" - "vlef %%v17,4(%%r1,%3),0 \n\t" - "vlef %%v16,8(%%r1,%3),1 \n\t" - "vlef %%v17,12(%%r1,%3),1 \n\t" - "vlef %%v16,16(%%r1,%3),2 \n\t" - "vlef %%v17,20(%%r1,%3),2 \n\t" - "vlef %%v16,24(%%r1,%3),3 \n\t" - "vlef %%v17,28(%%r1,%3),3 \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v28,16(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,32(%%r1,%3),0 \n\t" - "vlef %%v19,36(%%r1,%3),0 \n\t" - "vlef %%v18,40(%%r1,%3),1 \n\t" - "vlef %%v19,44(%%r1,%3),1 \n\t" - "vlef %%v18,48(%%r1,%3),2 \n\t" - "vlef %%v19,52(%%r1,%3),2 \n\t" - "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,60(%%r1,%3),3 \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v29,48(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,64(%%r1,%3),0 \n\t" - "vlef %%v21,68(%%r1,%3),0 \n\t" - "vlef %%v20,72(%%r1,%3),1 \n\t" - "vlef %%v21,76(%%r1,%3),1 \n\t" - "vlef %%v20,80(%%r1,%3),2 \n\t" - "vlef %%v21,84(%%r1,%3),2 \n\t" - "vlef %%v20,88(%%r1,%3),3 \n\t" - "vlef %%v21,92(%%r1,%3),3 \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,96(%%r1,%3),0 \n\t" - "vlef %%v23,100(%%r1,%3),0 \n\t" - "vlef %%v22,104(%%r1,%3),1 \n\t" - "vlef %%v23,108(%%r1,%3),1 \n\t" - "vlef %%v22,112(%%r1,%3),2 \n\t" - "vlef %%v23,116(%%r1,%3),2 \n\t" - "vlef %%v22,120(%%r1,%3),3 \n\t" - "vlef %%v23,124(%%r1,%3),3 \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -151,41 +151,25 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vlef %%v16,128(%%r1,%3),0 \n\t" - "vlef %%v17,132(%%r1,%3),0 \n\t" - "vlef %%v16,136(%%r1,%3),1 \n\t" - "vlef %%v17,140(%%r1,%3),1 \n\t" - "vlef %%v16,144(%%r1,%3),2 \n\t" - "vlef %%v17,148(%%r1,%3),2 \n\t" - "vlef %%v16,152(%%r1,%3),3 \n\t" - "vlef %%v17,156(%%r1,%3),3 \n\t" + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v28,144(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,160(%%r1,%3),0 \n\t" - "vlef %%v19,164(%%r1,%3),0 \n\t" - "vlef %%v18,168(%%r1,%3),1 \n\t" - "vlef %%v19,172(%%r1,%3),1 \n\t" - "vlef %%v18,176(%%r1,%3),2 \n\t" - "vlef %%v19,180(%%r1,%3),2 \n\t" - "vlef %%v18,184(%%r1,%3),3 \n\t" - "vlef %%v19,188(%%r1,%3),3 \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v29,176(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,192(%%r1,%3),0 \n\t" - "vlef %%v21,196(%%r1,%3),0 \n\t" - "vlef %%v20,200(%%r1,%3),1 \n\t" - "vlef %%v21,204(%%r1,%3),1 \n\t" - "vlef %%v20,208(%%r1,%3),2 \n\t" - "vlef %%v21,212(%%r1,%3),2 \n\t" - "vlef %%v20,216(%%r1,%3),3 \n\t" - "vlef %%v21,220(%%r1,%3),3 \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v30,208(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,224(%%r1,%3),0 \n\t" - "vlef %%v23,228(%%r1,%3),0 \n\t" - "vlef %%v22,232(%%r1,%3),1 \n\t" - "vlef %%v23,236(%%r1,%3),1 \n\t" - "vlef %%v22,240(%%r1,%3),2 \n\t" - "vlef %%v23,244(%%r1,%3),2 \n\t" - "vlef %%v22,248(%%r1,%3),3 \n\t" - "vlef %%v23,252(%%r1,%3),3 \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v31,240(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -258,7 +242,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "nop " :"=r"(iamin),"=m"(*amin) :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return iamin; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index b7214783f..cc6347127 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -132,7 +132,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) "ldr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amax; diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index d53fdb6b8..18610daea 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -132,7 +132,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) "ldr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amin; From 29416cb5a37b990052d019f66736af5263a81809 Mon Sep 17 00:00:00 2001 From: maamountki Date: Thu, 31 Jan 2019 19:11:11 +0200 Subject: [PATCH 26/26] [ZARCH] Add Z13 version for max/min functions --- kernel/zarch/KERNEL.Z13 | 12 +-- kernel/zarch/damax_z13.c | 204 ++++++++++++++++++++++++++++++++++++ kernel/zarch/damin_z13.c | 204 ++++++++++++++++++++++++++++++++++++ kernel/zarch/dmax_z13.c | 180 +++++++++++++++++++++++++++++++ kernel/zarch/dmin_z13.c | 180 +++++++++++++++++++++++++++++++ kernel/zarch/zamax_z13.c | 221 +++++++++++++++++++++++++++++++++++++++ kernel/zarch/zamin_z13.c | 221 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 1216 insertions(+), 6 deletions(-) create mode 100644 kernel/zarch/damax_z13.c create mode 100644 kernel/zarch/damin_z13.c create mode 100644 kernel/zarch/dmax_z13.c create mode 100644 kernel/zarch/dmin_z13.c create mode 100644 kernel/zarch/zamax_z13.c create mode 100644 kernel/zarch/zamin_z13.c diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index e5b974ab4..22c7e9703 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -1,18 +1,18 @@ SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = damax.c +DAMAXKERNEL = damax_z13.c CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = zamax.c +ZAMAXKERNEL = zamax_z13.c SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = damin.c +DAMINKERNEL = damin_z13.c CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = zamin.c +ZAMINKERNEL = zamin_z13.c SMAXKERNEL = ../arm/max.c -DMAXKERNEL = dmax.c +DMAXKERNEL = dmax_z13.c SMINKERNEL = ../arm/min.c -DMINKERNEL = dmin.c +DMINKERNEL = dmin_z13.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = idamax.c diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c new file mode 100644 index 000000000..95b94ee4a --- /dev/null +++ b/kernel/zarch/damax_z13.c @@ -0,0 +1,204 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = damax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c new file mode 100644 index 000000000..538690ee5 --- /dev/null +++ b/kernel/zarch/damin_z13.c @@ -0,0 +1,204 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = damin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c new file mode 100644 index 000000000..83e7b02a8 --- /dev/null +++ b/kernel/zarch/dmax_z13.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT max; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return max; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = dmax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c new file mode 100644 index 000000000..e64f90ee3 --- /dev/null +++ b/kernel/zarch/dmin_z13.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = dmin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c new file mode 100644 index 000000000..ae711c173 --- /dev/null +++ b/kernel/zarch/zamax_z13.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; + } + else + { + maxf=CABS1(x,0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf=CABS1(x,0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); + } + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); + } + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (maxf); + } +} diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c new file mode 100644 index 000000000..f82c57e81 --- /dev/null +++ b/kernel/zarch/zamin_z13.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + minf = zamin_kernel_16(n1, x); + ix = n1 * 2; + i = n1; + } + else + { + minf=CABS1(x,0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (minf); + + } else { + + minf=CABS1(x,0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); + } + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); + } + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (minf); + } +}