diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml
index b025f8634..29ec96f73 100644
--- a/.github/workflows/nightly-Homebrew-build.yml
+++ b/.github/workflows/nightly-Homebrew-build.yml
@@ -43,11 +43,6 @@ jobs:
     - name: Update Homebrew
       if: github.event_name != 'pull_request'
       run: brew update || true
-
-    - name: unlink installed gcc to allow updating
-      run: |
-        brew unlink gcc@8
-        brew unlink gcc@9
 
     - name: Install prerequisites
       run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index be9a32a7c..6be41960c 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -194,3 +194,6 @@ In chronological order:
 * PingTouGe Semiconductor Co., Ltd.
   * [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910
+
+* River Dillon
+  * [2021-07-10] Fix compilation with musl libc
diff --git a/Changelog.txt b/Changelog.txt
index 6c5cf573e..8cd101699 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,52 @@ OpenBLAS ChangeLog
+====================================================================
+Version 0.3.16
+	11-Jul-2021
+
+common:
+ - drastically reduced the stack size requirements for running the LAPACK
+   testsuite (Reference-LAPACK PR 553)
+ - fixed spurious test failures in the LAPACK testsuite (Reference-LAPACK
+   PR 564)
+ - expressly setting DYNAMIC_ARCH=0 no longer enables dynamic_arch mode
+ - improved performance of xGER, xSPR, xSPR2, xSYR, xSYR2, xTRSV, SGEMV_N
+   and DGEMV_N, for small input sizes and consecutive arguments
+ - improved performance of xGETRF, xPOTRF and xPOTRI for small input sizes
+   by disabling multithreading
+ - fixed installing with BSD versions of the "install" utility
+
+RISCV:
+ - fixed the implementation of xIMIN
+ - improved the performance of DSDOT
+ - fixed linking of the tests on C910V with current vendor gcc
+
+POWER:
+ - fixed SBGEMM computation for some odd value inputs
+ - fixed compilation for PPCG4, PPC970, POWER3, POWER4 and POWER5
+
+x86_64:
+ - improved performance of SGEMV_N and SGEMV_T for small N on AVX512-capable cpus
+ - worked around a miscompilation of ZGEMM/ZTRMM on Sandybridge with old gcc
+   versions
+ - fixed compilation with MS Visual Studio versions older than 2017
+ - fixed macro name collision with winnt.h from the latest Win10 SDK
+ - added cpu type autodetection for Intel Ice Lake SP
+ - fixed cpu type autodetection for Intel Tiger Lake
+ - added cpu type autodetection for recent Centaur/Zhaoxin models
+ - fixed compilation with musl libc
+
+ARM64:
+ - fixed compilation with gcc/gfortran on the Apple M1
+ - fixed linking of the tests on FreeBSD
+ - fixed missing restore of a register in the recently rewritten DNRM2 kernel
+   for ThunderX2 and Neoverse N1 that could cause spurious failures in e.g.
+   DGEEV
+ - added compiler optimization flags for the EMAG8180
+ - added initial support for Cortex A55
+
+ARM:
+ - fixed linking of the tests on FreeBSD
+
 ====================================================================
 Version 0.3.15
 	2-May-2021
diff --git a/Makefile.arm64 b/Makefile.arm64
index 23362b4e5..c23a0876e 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -57,6 +57,28 @@ endif
 endif
 endif
 
+# Fall back to Cortex-A53 tunings where -mtune=cortex-a55 is unavailable (it requires GCC >= 8.1)
+ifeq ($(CORE), CORTEXA55)
+ifeq ($(GCCVERSIONGTEQ7), 1)
+ifeq ($(GCCVERSIONGTEQ8), 1)
+CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
+endif
+else
+CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53
+endif
+endif
+else
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
+endif
+endif
+endif
+
 ifeq ($(CORE), THUNDERX)
 CCOMMON_OPT += -march=armv8-a -mtune=thunderx
 ifneq ($(F_COMPILER), NAG)
@@ -107,4 +129,13 @@ FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
 endif
 endif
 endif
+
+ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq ($(CORE), EMAG8180)
+CCOMMON_OPT += -march=armv8-a -mtune=emag
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8-a -mtune=emag
+endif
+endif
+endif
 endif
diff --git a/Makefile.install b/Makefile.install
index e8b64465f..28727de37 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -74,17 +74,17 @@ endif
 ifneq ($(OSNAME), AIX)
 ifndef NO_LAPACKE
 	@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
-	@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
-	@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
-	@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
-	@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
-	@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
+	@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
+	@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
+	@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
+	@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
+	@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
 endif
 
 #for install static library
 ifneq ($(NO_STATIC),1)
 	@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
-	@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+	@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
 	ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 endif
@@ -92,7 +92,7 @@ endif
 ifneq ($(NO_SHARED),1)
 	@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
-	@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+	@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
 	ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
 	ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
diff --git a/Makefile.system b/Makefile.system
index ae703e4d9..bb8c60e91 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -333,6 +333,7 @@ GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
 GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
 GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
 GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
+GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
 GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
 GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
 GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
@@ -380,6 +381,12 @@ ifeq ($(OSNAME), AIX)
 EXTRALIB += -lm
 endif
 
+ifeq ($(OSNAME), FreeBSD)
+ifeq ($(ARCH), $(filter $(ARCH),arm arm64))
+EXTRALIB += -lm
+endif
+endif
+
 ifeq ($(OSNAME), WINNT)
 NEED_PIC = 0
 NO_EXPRECISION = 1
@@ -619,6 +626,7 @@ DYNAMIC_CORE += CORTEXA57
 DYNAMIC_CORE += CORTEXA72
 DYNAMIC_CORE += CORTEXA73
 DYNAMIC_CORE += NEOVERSEN1
+DYNAMIC_CORE += CORTEXA55
 DYNAMIC_CORE += FALKOR
 DYNAMIC_CORE += THUNDERX
 DYNAMIC_CORE += THUNDERX2T99
diff --git a/Makefile.x86 b/Makefile.x86
index 893379c33..25ca660bd 100644
--- a/Makefile.x86
+++ b/Makefile.x86
@@ -1,6 +1,6 @@
 # COMPILER_PREFIX = mingw32-
 
-ifndef DYNAMIC_ARCH
+ifneq ($(DYNAMIC_ARCH),1)
 ADD_CPUFLAGS = 1
 else
 ifdef TARGET_CORE
diff --git a/Makefile.x86_64 b/Makefile.x86_64
index f62ab9e5e..307cbe1d9 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -9,7 +9,7 @@
 endif
 endif
 
-ifndef DYNAMIC_ARCH
+ifneq ($(DYNAMIC_ARCH),1)
 ADD_CPUFLAGS = 1
 else
 ifdef TARGET_CORE
diff --git a/README.md b/README.md
index 174f951f4..d7e0d60a7 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ We provide official binary packages for the following platform:
 
 * Windows x86/x86_64
 
-You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
+You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the GitHub project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases).
 ## Installation from Source
diff --git a/TargetList.txt b/TargetList.txt
index d19964916..f93a629d8 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -92,6 +92,7 @@ CORTEXA57
 CORTEXA72
 CORTEXA73
 NEOVERSEN1
+CORTEXA55
 EMAG8180
 FALKOR
 THUNDERX
diff --git a/appveyor.yml b/appveyor.yml
index c9b2fa3a1..d575c5b7f 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -47,6 +47,7 @@ environment:
 install:
   - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
   - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
+  - if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false
   - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
   - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
   - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 56a3fd4ae..889b920e3 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -137,3 +137,31 @@ jobs:
       source /opt/intel/oneapi/setvars.sh
       make CC=/usr/local/opt/llvm/bin/clang FC=ifort
 
+- job: OSX_NDK_ARMV7
+  pool:
+    vmImage: 'macOS-10.15'
+  steps:
+  - script: |
+      brew update
+      brew install --cask android-ndk
+      export ANDROID_NDK_HOME=/usr/local/share/android-ndk
+      make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
+
+- job: ALPINE_MUSL
+  pool:
+    vmImage: 'ubuntu-latest'
+  steps:
+  - script: |
+      wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
+        && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251  alpine-chroot-install' | sha1sum -c || exit 1
+      alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
+      sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
+      alpine make DYNAMIC_ARCH=1 BINARY=64
+      alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install
+      alpine ls -l mytestdir/include
+      alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c
+      alpine echo "#include <openblas_config.h>" >>test_install.c
+      alpine echo "int main(){" >> test_install.c
+      alpine echo "cpu_set_t* cpu_set = NULL;}" >>test_install.c
+      alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install
+
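For readability, the four `alpine echo` lines above assemble exactly this miniature C program into test_install.c; it does nothing at run time and exists only to prove that `openblas_config.h` — including its `cpu_set_t` usage — compiles and links against musl libc:

    // tests that inclusion of openblas_config.h works with musl
    #include <openblas_config.h>
    int main(){
    cpu_set_t* cpu_set = NULL;}
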
diff --git a/benchmark/getri.c b/benchmark/getri.c
index 98a860906..4c8891226 100644
--- a/benchmark/getri.c
+++ b/benchmark/getri.c
@@ -72,13 +72,17 @@ int main(int argc, char *argv[]){
   FLOAT *a,*work;
   FLOAT wkopt[4];
   blasint *ipiv;
-  blasint m, i, j, info,lwork;
+  blasint m, i, j, l, info,lwork;
 
   int from = 1;
   int to = 200;
   int step = 1;
+  int loops = 1;
 
-  double time1;
+  double time1,timeg;
+
+  char *p;
+  char btest = 'I';
 
   argc--;argv++;
 
@@ -86,6 +90,9 @@ int main(int argc, char *argv[]){
   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
   if (argc > 0) { step = atol(*argv); argc--; argv++;}
 
+  if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
+
+  if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
 
   fprintf(stderr, "From : %3d  To : %3d Step = %3d\n", from, to, step);
 
@@ -124,32 +131,41 @@ int main(int argc, char *argv[]){
   fprintf(stderr, "   SIZE       FLops       Time         Lwork\n");
 
   for(m = from; m <= to; m += step){
-
+    timeg = 0.;
     fprintf(stderr, " %6d : ", (int)m);
 
-    GETRF (&m, &m, a, &m, ipiv, &info);
+    for (l = 0; l < loops; l++) {
+    if (btest == 'F') begin();
+    GETRF (&m, &m, a, &m, ipiv, &info);
+    if (btest == 'F') {
+      end();
+      timeg += getsec();
+    }
 
     if (info) {
       fprintf(stderr, "Matrix is not singular .. %d\n", info);
       exit(1);
     }
 
-    begin();
+    if (btest == 'I') begin();
 
     lwork = -1;
     GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);
 
     lwork = (blasint)wkopt[0];
     GETRI(&m, a, &m, ipiv, work, &lwork, &info);
-    end();
+    if (btest == 'I') end();
 
     if (info) {
       fprintf(stderr, "failed compute inverse matrix .. %d\n", info);
       exit(1);
     }
 
-    time1 = getsec();
-
+    if (btest == 'I')
+      timeg += getsec();
+
+    } // loops
+    time1 = timeg/(double)loops;
     fprintf(stderr,
 	    " %10.2f MFlops : %10.2f Sec : %d\n",
 	    COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork);
diff --git a/benchmark/linpack.c b/benchmark/linpack.c
index 202035245..32ccb0386 100644
--- a/benchmark/linpack.c
+++ b/benchmark/linpack.c
@@ -72,17 +72,21 @@ int main(int argc, char *argv[]){
   FLOAT *a, *b;
   blasint *ipiv;
 
-  blasint m, i, j, info;
+  blasint m, i, j, l, info;
   blasint unit = 1;
 
   int from = 1;
   int to = 200;
   int step = 1;
+  int loops = 1;
 
   FLOAT maxerr;
 
-  double time1, time2;
+  double time1, time2, timeg1,timeg2;
+
+  char *p;
+  if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
 
   argc--;argv++;
 
   if (argc > 0) { from = atol(*argv); argc--; argv++;}
@@ -110,9 +114,9 @@ int main(int argc, char *argv[]){
   fprintf(stderr, "   SIZE    Residual     Decompose     Solve     Total\n");
 
   for(m = from; m <= to; m += step){
-
+    timeg1 = timeg2 = 0.;
     fprintf(stderr, " %6d : ", (int)m);
-
+    for (l = 0; l < loops; l++) {
     for(j = 0; j < m; j++){
       for(i = 0; i < m * COMPSIZE; i++){
 	a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
@@ -138,7 +142,7 @@ int main(int argc, char *argv[]){
       exit(1);
     }
 
-    time1 = getsec();
+    timeg1 += getsec();
 
     begin();
 
@@ -151,8 +155,10 @@ int main(int argc, char *argv[]){
       exit(1);
     }
 
-    time2 = getsec();
-
+    timeg2 += getsec();
+    } //loops
+    time1=timeg1/(double)loops;
+    time2=timeg2/(double)loops;
     maxerr = 0.;
 
     for(i = 0; i < m; i++){
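All five benchmark files touched here (getri, linpack, potrf, syr2, syrk) gain the same repetition logic: an optional OPENBLAS_LOOPS count is read from the environment, the timed section runs that many times, and the reported figures are per-iteration averages, which steadies the numbers for small, fast cases. Below is a minimal self-contained sketch of that pattern — an illustration, not OpenBLAS code; the real benchmarks use their begin()/end()/getsec() harness helpers rather than calling clock_gettime directly:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    /* stand-in for the benchmark harness timer */
    static double now_sec(void) {
      struct timespec ts;
      clock_gettime(CLOCK_MONOTONIC, &ts);
      return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
    }

    int main(void) {
      int loops = 1;
      char *p = getenv("OPENBLAS_LOOPS");
      if (p) loops = atoi(p);   /* parse the digits; *p would yield an ASCII code */

      double timeg = 0.;
      for (int l = 0; l < loops; l++) {
        double start = now_sec();
        /* ... timed kernel call would go here ... */
        timeg += now_sec() - start;
      }
      printf("average %g sec over %d loop(s)\n", timeg / (double)loops, loops);
      return 0;
    }
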
diff --git a/benchmark/potrf.c b/benchmark/potrf.c
index 116d0cca5..8808203a5 100644
--- a/benchmark/potrf.c
+++ b/benchmark/potrf.c
@@ -99,14 +99,15 @@ int main(int argc, char *argv[]){
   char *p;
   char btest = 'F';
 
-  blasint m, i, j, info, uplos=0;
-  double flops;
+  blasint m, i, j, l, info, uplos=0;
+  double flops = 0.;
 
   int from = 1;
   int to = 200;
   int step = 1;
+  int loops = 1;
 
-  double time1;
+  double time1, timeg;
 
   argc--;argv++;
 
@@ -119,6 +120,8 @@ int main(int argc, char *argv[]){
 
   if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
 
+  if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
+
   fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);
 
   if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
@@ -129,19 +132,21 @@ int main(int argc, char *argv[]){
     fprintf(stderr,"Out of Memory!!\n");exit(1);
   }
 
-  for(m = from; m <= to; m += step){
+  for(m = from; m <= to; m += step){
+    timeg=0.;
+    for (l = 0; l < loops; l++) {
 
 #ifndef COMPLEX
     if (uplos & 1) {
       for (j = 0; j < m; j++) {
 	for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.;
-	a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
+	a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
 	for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
       }
     } else {
       for (j = 0; j < m; j++) {
 	for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
-	a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
+	a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
 	for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.;
       }
     }
@@ -192,8 +197,8 @@ int main(int argc, char *argv[]){
       exit(1);
     }
 
-    time1 = getsec();
-    flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
+    if ( btest == 'F')
+      timeg += getsec();
 
     if ( btest == 'S' )
     {
@@ -214,9 +219,7 @@ int main(int argc, char *argv[]){
 	fprintf(stderr, "Potrs info = %d\n", info);
 	exit(1);
       }
-      time1 = getsec();
-      flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
-
+      timeg += getsec();
     }
 
     if ( btest == 'I' )
@@ -232,11 +235,17 @@ int main(int argc, char *argv[]){
 	fprintf(stderr, "Potri info = %d\n", info);
 	exit(1);
       }
-
-      time1 = getsec();
-      flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
+      timeg += getsec();
     }
-
+    } // loops
+
+    time1 = timeg/(double)loops;
+    if ( btest == 'F')
+      flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
+    if ( btest == 'S')
+      flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
+    if ( btest == 'I')
+      flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
 
     fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest);
diff --git a/benchmark/syr2.c b/benchmark/syr2.c
index acbc86987..61d1036ea 100644
--- a/benchmark/syr2.c
+++ b/benchmark/syr2.c
@@ -46,14 +46,17 @@ int main(int argc, char *argv[]){
 
   if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
 
-  blasint m, i, j;
+  blasint m, i, j, l;
   blasint inc_x= 1;
   blasint inc_y= 1;
 
   int from = 1;
   int to = 200;
   int step = 1;
+  int loops = 1;
 
-  double time1;
+  if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
+
+  double time1,timeg;
 
   argc--;argv++;
 
@@ -85,8 +88,9 @@ int main(int argc, char *argv[]){
 
   for(m = from; m <= to; m += step)
   {
-
+    timeg = 0.;
     fprintf(stderr, " %6d : ", (int)m);
+    for (l = 0; l < loops; l++) {
 
     for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
       x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
@@ -107,8 +111,10 @@ int main(int argc, char *argv[]){
 
     end();
 
-    time1 = getsec();
+    timeg += getsec();
 
+    } // loops
+    time1 = timeg/(double)loops;
     fprintf(stderr, " %10.2f MFlops\n",
 	    COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6);
diff --git a/benchmark/syrk.c b/benchmark/syrk.c
index 82606a21a..fa0f24666 100644
--- a/benchmark/syrk.c
+++ b/benchmark/syrk.c
@@ -56,17 +56,20 @@ int main(int argc, char *argv[]){
 
   char uplo='U';
   char trans='N';
-
+
   if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
   if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
 
-  blasint m, i, j;
+  blasint m, i, j, l;
 
   int from = 1;
   int to = 200;
   int step = 1;
+  int loops = 1;
 
-  double time1;
+  if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
+
+  double time1,timeg;
 
   argc--;argv++;
 
@@ -95,9 +98,12 @@ int main(int argc, char *argv[]){
 
   for(m = from; m <= to; m += step)
   {
+    timeg = 0.;
     fprintf(stderr, " %6d : ", (int)m);
 
+    for(l = 0; l < loops; l++) {
+
     for(j = 0; j < m; j++){
       for(i = 0; i < m * COMPSIZE; i++){
 	a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
@@ -111,8 +117,10 @@ int main(int argc, char *argv[]){
 
     end();
 
-    time1 = getsec();
-
+    timeg += getsec();
+
+    } //loops
+    time1 = timeg / (double)loops;
     fprintf(stderr,
 	    " %10.2f MFlops\n",
 	    COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 4451f9eaa..154e59db6 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -44,7 +44,7 @@ endif ()
 
 if (DYNAMIC_ARCH)
   if (ARM64)
-    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
+    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
     if (DYNAMIC_LIST)
       set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
     endif ()
diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index da7686c33..d86e10035 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -177,7 +177,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
     set(ZGEMM_UNROLL_M 4)
     set(ZGEMM_UNROLL_N 4)
     set(SYMV_P 16)
-  elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
+  elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53" OR "${TCORE}" STREQUAL "CORTEXA55")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_CODE_SIZE\t32768\n"
       "#define L1_CODE_LINESIZE\t64\n"
diff --git a/cmake/system.cmake b/cmake/system.cmake
index d6c71b774..34874827c 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -39,7 +39,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
   if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
     set(TARGET "BARCELONA")
   endif ()
-  if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
+  if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55")
     set(TARGET "ARMV7")
   endif ()
 endif ()
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 29b5a067b..794d73d06 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -254,6 +254,19 @@ function(GenerateNamedObjects sources_in)
 
       # now add the object and set the defines
       set(obj_defines ${defines_in})
+      list(FIND obj_defines "RC" def_idx)
+      if (${def_idx} GREATER -1)
+        # list(REMOVE_AT ${obj_defines} ${def_idx})
+        list (REMOVE_ITEM obj_defines "RC")
+        list(APPEND obj_defines "RC=RC")
+      endif ()
+      list(FIND obj_defines "CR" def_idx)
+      if (${def_idx} GREATER -1)
+        # list(REMOVE_AT ${obj_defines} ${def_idx})
+        list (REMOVE_ITEM obj_defines "CR")
+        list(APPEND obj_defines "CR=CR")
+      endif ()
+
      if (use_cblas)
        set(obj_name "cblas_${obj_name}")
        list(APPEND obj_defines "CBLAS")
diff --git a/common_interface.h b/common_interface.h
index b9ebb2772..318827920 100644
--- a/common_interface.h
+++ b/common_interface.h
@@ -709,6 +709,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *);
 int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);
 
+int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *);
+int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *);
+int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
+int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *);
+int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *);
+int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
+
 int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *);
 int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index 5f5d7771b..2a9399f7d 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -36,6 +36,7 @@ size_t length=sizeof(value);
 #define CPU_ARMV8       1
 // Arm
 #define CPU_CORTEXA53   2
+#define CPU_CORTEXA55  14
 #define CPU_CORTEXA57   3
 #define CPU_CORTEXA72   4
 #define CPU_CORTEXA73   5
@@ -67,7 +68,8 @@ static char *cpuname[] = {
   "EMAG8180",
   "NEOVERSEN1",
   "THUNDERX3T110",
-  "VORTEX"
+  "VORTEX",
+  "CORTEXA55"
 };
 
 static char *cpuname_lower[] = {
@@ -84,7 +86,8 @@ static char *cpuname_lower[] = {
   "emag8180",
   "neoversen1",
   "thunderx3t110",
-  "vortex"
+  "vortex",
+  "cortexa55"
 };
 
 int get_feature(char *search)
@@ -161,6 +164,8 @@ int detect(void)
 	  return CPU_CORTEXA73;
 	else if (strstr(cpu_part, "0xd0c"))
 	  return CPU_NEOVERSEN1;
+	else if (strstr(cpu_part, "0xd05"))
+	  return CPU_CORTEXA55;
       }
       // Qualcomm
       else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@@ -281,6 +286,7 @@ void get_cpuconfig(void)
   {
 
     case CPU_CORTEXA53:
+    case CPU_CORTEXA55:
       printf("#define %s\n", cpuname[d]);
       // Fall-through
     case CPU_ARMV8:
diff --git a/cpuid_x86.c b/cpuid_x86.c
index 44704fcd9..00fc8baa0 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -283,6 +283,7 @@ int get_vendor(void){
   if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX;
   if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN;
   if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR;
+  if (!strcmp(vendor, "  Shanghai  ")) return VENDOR_CENTAUR;
   if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE;
   if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
   if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
@@ -1398,6 +1399,17 @@ int get_cpuname(void){
 	    return CPUTYPE_SANDYBRIDGE;
 	  else
 	    return CPUTYPE_NEHALEM;
+	case 10: // Ice Lake SP
+	  if(support_avx512_bf16())
+	    return CPUTYPE_COOPERLAKE;
+	  if(support_avx512())
+	    return CPUTYPE_SKYLAKEX;
+	  if(support_avx2())
+	    return CPUTYPE_HASWELL;
+	  if(support_avx())
+	    return CPUTYPE_SANDYBRIDGE;
+	  else
+	    return CPUTYPE_NEHALEM;
       }
       break;
     case 7: // family 6 exmodel 7
@@ -1620,7 +1632,9 @@ int get_cpuname(void){
       case 0x6:
 	return CPUTYPE_NANO;
 	break;
-
+      case 0x7:
+	return CPUTYPE_NEHALEM;
+	break;
       }
       return CPUTYPE_VIAC3;
     }
@@ -2112,7 +2126,24 @@ int get_coretype(void){
 #endif
 	  else
 	    return CORE_NEHALEM;
-#endif
+#endif
	if (model == 10)
+#ifndef NO_AVX512
+	{
+	  if(support_avx512_bf16())
+	    return CORE_COOPERLAKE;
+	  return CORE_SKYLAKEX;
+	}
+#else
+	  if(support_avx())
+#ifndef NO_AVX2
+	    return CORE_HASWELL;
+#else
+	    return CORE_SANDYBRIDGE;
+#endif
+	  else
+	    return CORE_NEHALEM;
+#endif
 	break;
       case 7:
 	if (model == 10)
@@ -2135,13 +2164,13 @@
       case 8:
 	if (model == 12) { // Tiger Lake
 	  if(support_avx512())
-	    return CPUTYPE_SKYLAKEX;
+	    return CORE_SKYLAKEX;
 	  if(support_avx2())
-	    return CPUTYPE_HASWELL;
+	    return CORE_HASWELL;
 	  if(support_avx())
-	    return CPUTYPE_SANDYBRIDGE;
+	    return CORE_SANDYBRIDGE;
 	  else
-	    return CPUTYPE_NEHALEM;
+	    return CORE_NEHALEM;
 	}
 	if (model == 14) { // Kaby Lake
 	  if(support_avx())
@@ -2259,6 +2288,9 @@ int get_coretype(void){
       case 0x6:
 	return CORE_NANO;
 	break;
+      case 0x7:
+	return CORE_NEHALEM;
+	break;
       }
       return CORE_VIAC3;
     }
diff --git a/driver/level3/Makefile b/driver/level3/Makefile
index 78f32b961..b8465d4ed 100644
--- a/driver/level3/Makefile
+++ b/driver/level3/Makefile
@@ -425,7 +425,7 @@ cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -473,7 +473,7 @@ zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -521,7 +521,7 @@ xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -632,7 +632,7 @@ cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -680,7 +680,7 @@ zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -728,7 +728,7 @@ xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
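A note on the recurring `-DRC` → `-DRC=RC` change (and the matching `RC=RC`/`CR=CR` handling added to cmake/utils.cmake above): these complex-GEMM variant flags are only ever tested with `#ifdef`, but a bare `-DRC` is equivalent to `-DRC=1`, so the macro also rewrites any unrelated use of `RC` as an identifier — which, per the changelog entry, is what collided with declarations in the recent Win10 SDK's winnt.h. Defining the macro as itself keeps `#ifdef RC` working while leaving other occurrences of the token untouched. A small illustration of the mechanism (hypothetical name FOO, not OpenBLAS code):

    /* Compile with -DFOO=FOO (fine) versus plain -DFOO (breaks the typedef). */
    typedef struct { int x; } FOO;   /* stands in for a system-header use of the name */

    #ifdef FOO                       /* true for both -DFOO and -DFOO=FOO */
    /* variant-specific code would go here */
    #endif

    int main(void) { FOO f = { 0 }; return f.x; }

With `-DFOO=FOO` the self-referential macro is not re-expanded (C11 6.10.3.4), so the typedef survives; with plain `-DFOO` it preprocesses to `typedef struct { int x; } 1;` and fails to compile.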
 
 xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -1895,7 +1895,7 @@ cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
-	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -1943,7 +1943,7 @@ zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -1991,7 +1991,7 @@ xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2048,7 +2048,7 @@ cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 cgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2096,7 +2096,7 @@ zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2144,7 +2144,7 @@ xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2817,7 +2817,7 @@ cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2865,7 +2865,7 @@ zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2913,7 +2913,7 @@ xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -3025,7 +3025,7 @@ cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -3073,7 +3073,7 @@ zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -3121,7 +3121,7 @@ xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4288,7 +4288,7 @@ cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
-	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4336,7 +4336,7 @@ zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4384,7 +4384,7 @@ xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4441,7 +4441,7 @@ cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4489,7 +4489,7 @@ zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4537,7 +4537,7 @@ xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
 
 xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
 
 xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 158e1b3da..1a33870db 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -404,6 +404,7 @@ static int get_vendor(void){
   if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
   if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
   if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
+  if (!strcmp(vendor.vchar, "  Shanghai  ")) return VENDOR_CENTAUR;
   if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
 
   if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
@@ -621,6 +622,22 @@ static gotoblas_t *get_coretype(void){
 	  return &gotoblas_NEHALEM;
 	}
       }
+      if (model == 10) {
+	// Ice Lake SP
+	if(support_avx512_bf16())
+	  return &gotoblas_COOPERLAKE;
+	if (support_avx512())
+	  return &gotoblas_SKYLAKEX;
+	if(support_avx2())
+	  return &gotoblas_HASWELL;
+	if(support_avx()) {
+	  openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	  return &gotoblas_SANDYBRIDGE;
+	} else {
+	  openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	  return &gotoblas_NEHALEM;
+	}
+      }
       return NULL;
     case 7:
       if (model == 10) // Goldmont Plus
@@ -808,6 +825,9 @@ static gotoblas_t *get_coretype(void){
     switch (family) {
     case 0x6:
       return &gotoblas_NANO;
+      break;
+    case 0x7:
+      return &gotoblas_NEHALEM;
     }
   }
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index 0b623c3ac..04ceaaf6d 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -99,6 +99,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
 #else
 #define gotoblas_NEOVERSEN1 gotoblas_ARMV8
 #endif
+#ifdef DYN_CORTEX_A55
+extern gotoblas_t gotoblas_CORTEXA55;
+#else
+#define gotoblas_CORTEXA55 gotoblas_ARMV8
+#endif
 #else
 extern gotoblas_t gotoblas_CORTEXA53;
 extern gotoblas_t gotoblas_CORTEXA57;
@@ -111,11 +116,12 @@ extern gotoblas_t gotoblas_TSV110;
 extern gotoblas_t gotoblas_EMAG8180;
 extern gotoblas_t gotoblas_NEOVERSEN1;
 extern gotoblas_t gotoblas_THUNDERX3T110;
+extern gotoblas_t gotoblas_CORTEXA55;
 #endif
 
 extern void openblas_warning(int verbose, const char * msg);
 
-#define NUM_CORETYPES 12
+#define NUM_CORETYPES 13
 
 /*
  * In case asm/hwcap.h is outdated on the build system, make sure
@@ -142,6 +148,7 @@ static char *corename[] = {
   "emag8180",
   "neoversen1",
   "thunderx3t110",
+  "cortexa55",
   "unknown"
 };
 
@@ -158,6 +165,7 @@ char *gotoblas_corename(void) {
   if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
   if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
   if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11];
+  if (gotoblas == &gotoblas_CORTEXA55) return corename[12];
   return corename[NUM_CORETYPES];
 }
 
@@ -189,6 +197,7 @@ static gotoblas_t *force_coretype(char *coretype) {
     case  9: return (&gotoblas_EMAG8180);
     case 10: return (&gotoblas_NEOVERSEN1);
     case 11: return (&gotoblas_THUNDERX3T110);
+    case 12: return (&gotoblas_CORTEXA55);
   }
   snprintf(message, 128, "Core not found: %s\n", coretype);
   openblas_warning(1, message);
@@ -247,6 +256,8 @@ static gotoblas_t *get_coretype(void) {
 	  return &gotoblas_CORTEXA73;
 	case 0xd0c: // Neoverse N1
 	  return &gotoblas_NEOVERSEN1;
+	case 0xd05: // Cortex A55
+	  return &gotoblas_CORTEXA55;
       }
       break;
     case 0x42: // Broadcom
diff --git a/driver/others/memory.c b/driver/others/memory.c
index 63fa6a566..6e654ccf2 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -1702,7 +1702,6 @@ inline int atoi(const char *str) { return 0; }
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/f_check b/f_check
index 2c0d7fcb9..4825fb09a 100644
--- a/f_check
+++ b/f_check
@@ -314,11 +314,11 @@ if ($link ne "") {
 
         $link =~ s/\-Y\sP\,/\-Y/g;
 
-        $link =~ s/\-R\s*/\-rpath\@/g;
+        $link =~ s/\-R\s*/\-rpath\%/g;
 
-        $link =~ s/\-rpath\s+/\-rpath\@/g;
+        $link =~ s/\-rpath\s+/\-rpath\%/g;
 
-        $link =~ s/\-rpath-link\s+/\-rpath-link\@/g;
+        $link =~ s/\-rpath-link\s+/\-rpath-link\%/g;
 
 	@flags = split(/[\s\,\n]/, $link);
 	# remove leading and trailing quotes from each flag.
@@ -344,13 +344,13 @@ if ($link ne "") {
 
 	}
 
-	if ($flags =~ /^\-rpath\@/) {
-	    $flags =~ s/\@/\,/g;
+	if ($flags =~ /^\-rpath\%/) {
+	    $flags =~ s/\%/\,/g;
 	    $linker_L .= "-Wl,". $flags . " " ;
 	}
 
-	if ($flags =~ /^\-rpath-link\@/) {
-	    $flags =~ s/\@/\,/g;
+	if ($flags =~ /^\-rpath-link\%/) {
+	    $flags =~ s/\%/\,/g;
 	    $linker_L .= "-Wl,". $flags . " " ;
" " ; } if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { diff --git a/getarch.c b/getarch.c index f48944f36..3bc8a0c3d 100644 --- a/getarch.c +++ b/getarch.c @@ -1159,6 +1159,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_CORTEXA55 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA55" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA55 " \ + "-DL1_CODE_SIZE=16384 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=65536 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa55" +#define CORENAME "CORTEXA55" +#else +#endif #ifdef FORCE_FALKOR #define FORCE diff --git a/interface/gemm.c b/interface/gemm.c index 6fde69049..10426fd8f 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -49,6 +49,8 @@ #define ERROR_NAME "QGEMM " #elif defined(DOUBLE) #define ERROR_NAME "DGEMM " +#elif defined(BFLOAT16) +#define ERROR_NAME "SBGEMM " #else #define ERROR_NAME "SGEMM " #endif @@ -124,6 +126,7 @@ void NAME(char *TRANSA, char *TRANSB, #ifdef SMP double MNK; +#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -142,6 +145,7 @@ void NAME(char *TRANSA, char *TRANSB, #endif #endif #endif +#endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; diff --git a/interface/gemv.c b/interface/gemv.c index d5d739fb1..1f14cdb2c 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -202,6 +202,11 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == ZERO) return; + if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { + GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/ger.c b/interface/ger.c index 8cf1614e3..af6ae8606 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -164,6 +164,11 @@ void CNAME(enum CBLAS_ORDER order, if (m == 0 || n == 0) return; if (alpha == 0.) 
 
+  if (incx == 1 && incy == 1 && 1L*m*n <= 2048 *GEMM_MULTITHREAD_THRESHOLD) {
+    GER(m, n, 0, alpha, x, incx, y, incy, a, lda, NULL);
+    return;
+  }
+
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
diff --git a/interface/lapack/getrf.c b/interface/lapack/getrf.c
index 02bb124b3..323370ebc 100644
--- a/interface/lapack/getrf.c
+++ b/interface/lapack/getrf.c
@@ -95,7 +95,14 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
 
 #ifdef SMP
   args.common = NULL;
-  args.nthreads = num_cpu_avail(4);
+#ifndef DOUBLE
+  if (args.m*args.n < 40000)
+#else
+  if (args.m*args.n < 10000)
+#endif
+    args.nthreads=1;
+  else
+    args.nthreads = num_cpu_avail(4);
 
   if (args.nthreads == 1) {
 #endif
diff --git a/interface/lapack/potrf.c b/interface/lapack/potrf.c
index dbd55f62f..3abc80133 100644
--- a/interface/lapack/potrf.c
+++ b/interface/lapack/potrf.c
@@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
 
 #ifdef SMP
   args.common = NULL;
+#ifndef DOUBLE
+  if (args.n <128)
+#else
+  if (args.n <64)
+#endif
+    args.nthreads = 1;
+  else
   args.nthreads = num_cpu_avail(4);
 
   if (args.nthreads == 1) {
diff --git a/interface/lapack/potri.c b/interface/lapack/potri.c
index 2c0c64b6f..eb0fcbe70 100644
--- a/interface/lapack/potri.c
+++ b/interface/lapack/potri.c
@@ -121,6 +121,9 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
 
 #ifdef SMP
   args.common = NULL;
+  if (args.n < 180)
+    args.nthreads = 1;
+  else
   args.nthreads = num_cpu_avail(4);
 
   if (args.nthreads == 1) {
diff --git a/interface/lapack/zgetrf.c b/interface/lapack/zgetrf.c
index 7f8db94f6..d03541fad 100644
--- a/interface/lapack/zgetrf.c
+++ b/interface/lapack/zgetrf.c
@@ -95,7 +95,10 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
 
 #ifdef SMP
   args.common = NULL;
-  args.nthreads = num_cpu_avail(4);
+  if (args.m*args.n <10000)
+    args.nthreads = 1;
+  else
+    args.nthreads = num_cpu_avail(4);
 
   if (args.nthreads == 1) {
 #endif
diff --git a/interface/lapack/zpotrf.c b/interface/lapack/zpotrf.c
index c4cd99bf6..298efbbc1 100644
--- a/interface/lapack/zpotrf.c
+++ b/interface/lapack/zpotrf.c
@@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
 
 #ifdef SMP
   args.common = NULL;
+#ifndef DOUBLE
+  if (args.n < 64)
+#else
+  if (args.n < 64)
+#endif
+    args.nthreads = 1;
+  else
   args.nthreads = num_cpu_avail(4);
 
   if (args.nthreads == 1) {
diff --git a/interface/lapack/zpotri.c b/interface/lapack/zpotri.c
index 8da211683..8748c6352 100644
--- a/interface/lapack/zpotri.c
+++ b/interface/lapack/zpotri.c
@@ -121,6 +121,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
 
 #ifdef SMP
+#ifndef DOUBLE
+  if (args.n < 200)
+#else
+  if (args.n < 150)
+#endif
+    args.nthreads=1;
+  else
   args.nthreads = num_cpu_avail(4);
 
   if (args.nthreads == 1) {
 #endif
diff --git a/interface/spr.c b/interface/spr.c
index 1956986e9..8aafc9f85 100644
--- a/interface/spr.c
+++ b/interface/spr.c
@@ -167,6 +167,26 @@ void CNAME(enum CBLAS_ORDER order,
 
   FUNCTION_PROFILE_START();
 
+  if (incx == 1 && n <100) {
+    blasint i;
+    if (uplo==0) {
+      for (i = 0; i < n; i++){
+        if (x[i] != ZERO) {
+          AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0);
+        }
+        a += i + 1;
+      }
+    } else {
+      for (i = 0; i < n; i++){
+        if (x[i] != ZERO) {
+          AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0);
+        }
+        a += n - i;
+      }
+    }
+    return;
+  }
+
   if (incx < 0 ) x -= (n - 1) * incx;
 
   buffer = (FLOAT *)blas_memory_alloc(1);
diff --git a/interface/spr2.c b/interface/spr2.c
index 73a811c3e..b5aab1767 100644
--- a/interface/spr2.c
+++ b/interface/spr2.c
@@ -168,6 +168,24 @@ void CNAME(enum CBLAS_ORDER order,
 
   if (alpha == ZERO) return;
 
+  if (incx == 1 && incy == 1 && n < 50) {
+    blasint i;
+    if (!uplo) {
+      for (i = 0; i < n; i++){
+        AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0);
+        AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0);
+        a += i + 1;
+      }
+    } else {
+      for (i = 0; i < n; i++){
+        AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0);
+        AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0);
+        a += n - i;
+      }
+    }
+    return;
+  }
+
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
diff --git a/interface/syr.c b/interface/syr.c
index 1374bcc69..ad75264b1 100644
--- a/interface/syr.c
+++ b/interface/syr.c
@@ -168,7 +168,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
 
+#if 1
+  if (incx == 1 && n < 100) {
+    BLASLONG i;
+    if (uplo == 0) {
+      for (i = 0; i < n; i++){
+        if (x[i] != ZERO) {
+          AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0);
+        }
+        a += lda;
+      }
+    } else {
+      for (i = 0; i < n; i++){
+        if (x[i] != ZERO) {
+          AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0);
+        }
+        a += 1 + lda;
+      }
+    }
+    return;
+  }
+#endif
   if (incx < 0 ) x -= (n - 1) * incx;
 
   buffer = (FLOAT *)blas_memory_alloc(1);
diff --git a/interface/syr2.c b/interface/syr2.c
index 08fd47e57..632906d28 100644
--- a/interface/syr2.c
+++ b/interface/syr2.c
@@ -170,6 +170,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
 
   IDEBUG_START;
 
+  if (incx == 1 && incy == 1 && n < 100) {
+    blasint i;
+    if (!uplo) {
+      for (i = 0; i < n; i++){
+        AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0);
+        AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0);
+        a += lda;
+      }
+    } else {
+      for (i = 0; i < n; i++){
+        AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0);
+        AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0);
+        a += 1 + lda;
+      }
+    }
+    return;
+  }
+
 
   FUNCTION_PROFILE_START();
 
   if (incx < 0 ) x -= (n - 1) * incx;
diff --git a/interface/syrk.c b/interface/syrk.c
index 7699db683..edb113d6c 100644
--- a/interface/syrk.c
+++ b/interface/syrk.c
@@ -354,6 +354,17 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
 #endif
 
   args.common = NULL;
+#ifndef COMPLEX
+#ifdef DOUBLE
+  if (args.n < 100)
+#else
+  if (args.n < 200)
+#endif
+#else
+  if (args.n < 65)
+#endif
+    args.nthreads = 1;
+  else
   args.nthreads = num_cpu_avail(3);
 
   if (args.nthreads == 1) {
diff --git a/interface/trsv.c b/interface/trsv.c
index a054d8eeb..6a6e8f8ba 100644
--- a/interface/trsv.c
+++ b/interface/trsv.c
@@ -188,6 +188,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
   if (n == 0) return;
 
+  if (incx == 1 && trans == 0 && n < 50) {
+    buffer = NULL;
+    (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
+    return;
+  }
+
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
diff --git a/interface/zsyr.c b/interface/zsyr.c
index 09b1de578..71d4dbf29 100644
--- a/interface/zsyr.c
+++ b/interface/zsyr.c
@@ -172,6 +172,32 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
 
   if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
 
+  if (incx == 1 && n < 50) {
+    blasint i;
+    if (!uplo) {
+      for (i = 0; i < n; i++){
+        if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) {
+          AXPYU_K(i + 1, 0, 0,
+                  alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1],
+                  alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1],
+                  x, 1, a, 1, NULL, 0);
+        }
+        a += lda;
+      }
+    } else {
+      for (i = 0; i < n; i++){
+        if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) {
+          AXPYU_K(n - i, 0, 0,
+                  alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1],
+                  alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1],
+                  x + i * 2, 1, a, 1, NULL, 0);
+        }
+        a += 2 + lda;
+      }
+    }
+    return;
+  }
+
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
diff --git a/interface/ztrsv.c b/interface/ztrsv.c
index cbb7bba13..cf750b0b0 100644
--- a/interface/ztrsv.c
+++ b/interface/ztrsv.c
@@ -199,6 +199,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
   if (n == 0) return;
 
+  if (incx == 1 && trans == 0 && n < 50) {
+    buffer = NULL;
+    (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
+    return;
+  }
+
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index d8d739965..2d9e3ec36 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -818,6 +818,8 @@ ifeq ($(OS), AIX)
 	m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
 	rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
+else ifeq ($(CORE),SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
 endif
@@ -828,6 +830,8 @@ ifeq ($(OS), AIX)
 	m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
 	rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
+else ifeq ($(CORE),SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
 endif
@@ -838,6 +842,8 @@ ifeq ($(OS), AIX)
 	m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
 	rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
+else ifeq ($(CORE),SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
 endif
@@ -848,6 +854,8 @@ ifeq ($(OS), AIX)
 	m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
 	rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
+else ifeq ($(CORE),SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
 endif
@@ -1044,6 +1052,8 @@ ifeq ($(OS), AIX)
 	m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
 	rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
+else ifeq ($(CORE), SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
 endif
@@ -1054,6 +1064,8 @@ ifeq ($(OS), AIX)
 	m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
 	rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
+else ifeq ($(CORE), SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
 endif
@@ -1064,6 +1076,8 @@ ifeq ($(OS), AIX)
 	m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
 	rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
+else ifeq ($(CORE), SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
 endif
@@ -1074,6 +1088,8 @@ ifeq ($(OS), AIX)
 	m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
 	rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
+else ifeq ($(CORE), SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
 endif
@@ -1084,6 +1100,8 @@ ifeq ($(OS), AIX)
 	m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
 	rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
+else ifeq ($(CORE), SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
 endif
@@ -1094,6 +1112,8 @@ ifeq ($(OS), AIX)
 	m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
 	rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
+else ifeq ($(CORE), SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
 endif
@@ -1104,6 +1124,8 @@ ifeq ($(OS), AIX)
 	m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
 	rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
+else ifeq ($(CORE), SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
 endif
@@ -1114,6 +1136,8 @@ ifeq ($(OS), AIX)
 	m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
 	rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
+else ifeq ($(CORE), SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 else
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 endif
@@ -1187,29 +1211,55 @@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 
 $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
+ifeq ($(CORE),SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+else
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+endif
 
 $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
+ifeq ($(CORE),SANDYBRIDGE)
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - +endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - +endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - +endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - +endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - +endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - +endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif +endif diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 new file mode 100644 index 000000000..db322dd0d --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -0,0 +1,196 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + 
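A note on how these KERNEL.<CORE> files work: each variable names the source file implementing one BLAS routine for that core. Bare .S entries are arm64 assembly kernels from kernel/arm64, while ../arm/*.c and ../generic/*.c entries select portable scalar C fallbacks, as this new KERNEL.CORTEXA55 file does above for the AMIN/AMAX/IMIN/IMAX family and the TRSM kernels. A fallback kernel is an ordinary C function; the following is a hypothetical, simplified sketch of the shape such a file has, with local typedefs standing in for OpenBLAS's common.h:

#include <math.h>

typedef long BLASLONG;
typedef float FLOAT;

/* Sketch of a scalar AMIN-style fallback: smallest absolute value
   of a strided vector (error handling as in the real kernels). */
FLOAT amin_fallback(BLASLONG n, const FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0f;
    FLOAT minf = fabsf(x[0]);
    BLASLONG ix = inc_x;
    for (BLASLONG i = 1; i < n; i++) {
        FLOAT a = fabsf(x[ix]);
        if (a < minf) minf = a;
        ix += inc_x;
    }
    return minf;
}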
+SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +else +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +endif +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = 
cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index b021a2832..fba2fe8ce 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -321,7 +321,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8" + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", REGINF ); } diff --git a/kernel/power/KERNEL.POWER5 b/kernel/power/KERNEL.POWER5 index fbef79e59..bea7b17c8 100644 --- a/kernel/power/KERNEL.POWER5 +++ b/kernel/power/KERNEL.POWER5 @@ -54,3 +54,8 @@ ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c +SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index 677af5f21..fd9a8c780 100644 --- a/kernel/power/KERNEL.PPC440 +++ b/kernel/power/KERNEL.PPC440 @@ -16,11 +16,11 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -CAXPYKERNEL = ../arm/zaxpy.c -ZAXPYKERNEL = ../arm/zaxpy.c -else CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +else +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c endif SDOTKERNEL = dot_ppc440.S diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 54660b54d..1bdd3119e 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -15,8 +15,13 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +else +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c +endif SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index abc61b62e..9c6f87639 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -159,6 +159,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 25a4dd01b..accdad702 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -124,6 +124,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 48 #define PREFETCHSIZE_C 16 diff --git 
a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index d15586703..134929ec1 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -49,17 +49,11 @@ typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); -vector char mask = - { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, - 0xf -}; - /* * BFLOAT16 xvbf16ger2pp instruction needs 4×2 matrix of * bfloat16 floating-point values as input. Hence this * merging is needed on A and B matrices. */ -#define MERGE_ROW(x) vec_perm(x, x, mask) #define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y) #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) @@ -104,6 +98,30 @@ vector char mask = rowC = (v2sf_t *) &CO[7* ldc+J]; \ rowC[0] += result[6] * alpha; + #define SAVE4x2_ACC_SCALAR(ACC) { \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + res[0] = result[0] * alpha; \ + res[1] = result[1] * alpha; \ + res[2] = result[2] * alpha; \ + res[3] = result[3] * alpha; \ + CO[0 * ldc] += res[0][0]; \ + CO[1 * ldc] += res[1][0]; \ + CO[2 * ldc] += res[2][0]; \ + CO[3 * ldc] += res[3][0]; \ + } + + #define SAVE4x2_ACC1_SCALAR(ACC) { \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + res[0] = result[0] * alpha; \ + res[1] = result[1] * alpha; \ + res[2] = result[2] * alpha; \ + res[3] = result[3] * alpha; \ + CO[4 * ldc] += res[0][0]; \ + CO[5 * ldc] += res[1][0]; \ + CO[6 * ldc] += res[2][0]; \ + CO[7 * ldc] += res[3][0]; \ +} + #define MMA __builtin_mma_xvbf16ger2pp #define SAVE2x4_ACC(ACC, J) \ @@ -179,8 +197,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 4; vec_t *rowA = (vec_t *) & (AO[l << 1]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); + vec_t rowB_l = MERGE_LOW (rowB[0], vzero); vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); vec_t rowA_l = MERGE_LOW (rowA[0], vzero); vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero); @@ -231,8 +249,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 4; vec_t *rowA = (vec_t *) & (AO[l]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); + vec_t rowB_l = MERGE_LOW (rowB[0], vzero); vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); vec_t rowA_l = MERGE_LOW (rowA[0], vzero); MMA (&acc0, rowB_h, rowA_h); @@ -271,8 +289,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; vec_t *rowB = (vec_t *) & (BO[l << 1]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE_ACC (&acc0, 0); SAVE_ACC1 (&acc1, 0); @@ -306,8 +324,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[(l << 2)]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) 
rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); SAVE4x2_ACC1 (&acc1, 0); @@ -319,7 +337,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { IFLOAT *BO = B; v2sf_t *rowC; - v2sf_t result[8]; + v4sf_t result[4], res[4]; __vector_quad acc0, acc1; __builtin_mma_xxsetaccz (&acc0); __builtin_mma_xxsetaccz (&acc1); @@ -338,11 +356,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[(l << 3)]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } - SAVE4x2_ACC (&acc0, 0); - SAVE4x2_ACC1 (&acc1, 0); + SAVE4x2_ACC_SCALAR (&acc0); + SAVE4x2_ACC1_SCALAR (&acc1); CO += 1; AO += k; BO += (k << 3); @@ -387,16 +405,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]); - vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); - MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); - MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero)); - MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); - MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, (vec_t)rowB_mrg, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, (vec_t)rowB_mrg, MERGE_LOW (rowA1[1], vzero)); } SAVE_ACC (&acc0, 0); @@ -436,12 +454,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); } SAVE_ACC (&acc0, 0); @@ -475,9 +493,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[l << 1]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, 
(vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); } SAVE_ACC (&acc0, 0); SAVE_ACC (&acc1, 4); @@ -505,8 +524,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vector short rowA = { AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; - vec_t *rowB = (vec_t *) & (BO[l]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE_ACC (&acc0, 0); CO += 4; @@ -536,8 +556,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 2; vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 }; - vec_t *rowB = (vec_t *) & (BO[l << 1]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[(l<<1)], 0, BO[(l<<1) + 1], 0, BO[(l<<1) + 2], 0, + BO[(l<<1) + 3], 0 + }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); CO += 2; @@ -548,7 +571,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { IFLOAT *BO = B; v2sf_t *rowC; - v2sf_t result[8]; + v4sf_t result[4], res[4]; __vector_quad acc0; BLASLONG l = 0; __builtin_mma_xxsetaccz (&acc0); @@ -566,10 +589,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 1; vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; - vec_t *rowB = (vec_t *) & (BO[l << 2]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[(l<<2) + 0], 0, BO[(l<<2) + 1], 0, BO[(l <<2) + 2], 0, + BO[(l<<2) + 3], 0 + }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } - SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC_SCALAR (&acc0); AO += k; BO += (k << 2); CO += 1; @@ -620,14 +646,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); vec_t *rowA1 = (vec_t *) & (A1[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); - MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); - MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); - MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); - MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -669,10 +695,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero )); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, 
(vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -708,8 +734,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -740,8 +766,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; - vec_t *rowA = (vec_t *) & (AO[l << 1]); - MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + vector short rowA = + { AO[(l << 1)], 0, AO[(l << 1) + 1] , 0 , AO[(l<<1) + 2], + 0, AO[(l << 1) + 3], 0 }; + MMA (&acc0, (vec_t) rowB, (vec_t)(rowA)); } SAVE2x4_ACC (&acc0, 0); CO += 4; @@ -829,10 +857,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 4)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); @@ -871,8 +899,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 3)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); @@ -904,8 +932,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; - vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + vector short rowA = + { AO[(l << 2)], 0, AO[(l << 2) + 1] , 0 , + AO[(l << 2) + 2], 0, AO[(l << 2) + 3], 0 }; + MMA (&acc0, (vec_t) rowB, (vec_t)(rowA)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index 708f1318d..48f49f97b 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -155,6 +155,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index d82fab16a..314cf5e6e 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -129,6 +129,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define 
PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index ea6a8cf21..61a8a2b91 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -54,6 +54,7 @@ SDOTKERNEL = ../riscv64/dot.c DDOTKERNEL = ../riscv64/dot.c CDOTKERNEL = ../riscv64/zdot.c ZDOTKERNEL = ../riscv64/zdot.c +DSDOTKERNEL = ../generic/dot.c SNRM2KERNEL = ../riscv64/nrm2.c DNRM2KERNEL = ../riscv64/nrm2.c diff --git a/kernel/riscv64/Makefile b/kernel/riscv64/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/riscv64/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b6aec131e..5312f9ef0 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,19 +66,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_zero; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_zero = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = 
inc_x * gvl; @@ -162,6 +175,7 @@ asm volatile( //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -170,6 +184,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -185,6 +200,7 @@ asm volatile( //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -193,6 +209,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -205,17 +222,17 @@ asm volatile( j += gvl*2; ix += inc_xv*2; } - v0 = VFMVVF_FLOAT(0, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index 53243ad56..ae2867ef8 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,11 +66,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl 
= VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); MASK_T mask0, mask1; - FLOAT zero = 0.0; + FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -61,39 +65,43 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl 
= VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int stride_y = inc_y * sizeof(FLOAT); @@ -150,20 +156,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) j += gvl; } if(j > 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } } return(dot); diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index bd4d23eae..32ca8618b 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -27,23 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
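The hunks above all apply the same rewrite of the reduction idiom: the old intrinsics reduced an m8 accumulator into a freshly splatted m8 zero vector and read element 0, whereas the new API folds the m8 accumulator directly into an m1 register (v_res) that is seeded once outside the loop. A minimal sketch of the pattern, reusing the macro names defined in the asum_vector.c hunk above; it assumes a toolchain shipping the pre-ratification RVV intrinsics this port targets, and it omits the strided (inc_x != 1) path:

FLOAT vector_sum(BLASLONG n, FLOAT *x)   /* hypothetical helper, unit stride */
{
    unsigned int gvl = VSETVL_MAX;                 /* VLMAX for the m1 type */
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, gvl);  /* m1 result register    */
    FLOAT_V_T_M1 v_z0  = VFMVVF_FLOAT_M1(0, gvl);  /* identity element      */
    BLASLONG j = 0;
    FLOAT s = 0.0;

    gvl = VSETVL(n);
    FLOAT_V_T v_sum = VFMVVF_FLOAT(0, gvl);
    for (; j + gvl <= n; j += gvl)                 /* full blocks           */
        v_sum = VFADDVV_FLOAT(v_sum, VLEV_FLOAT(&x[j], gvl), gvl);
    v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);  /* m8 -> m1 fold    */
    s = v_res[0];
    if (j < n) {                                   /* tail block            */
        gvl = VSETVL(n - j);
        v_res = VFREDSUMVS_FLOAT(v_res, VLEV_FLOAT(&x[j], gvl), v_z0, gvl);
        s += v_res[0];
    }
    return s;
}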
#include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSEV_FLOAT vsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSEV_FLOAT vse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSEV_FLOAT vsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSEV_FLOAT vse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -57,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT_V_T va0, va1, vy0, vy1; unsigned int gvl = 0; if(inc_y == 1){ - gvl = vsetvli(m, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m); if(gvl <= m/2){ for(k=0,j=0; k maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -135,7 +142,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -145,35 +152,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index 608f19a00..5bcffece5 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -32,49 +32,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
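Another recurring change visible above is the argument order of masked intrinsics: where the old API took the mask in trailing position, as in VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl), the new vfrsub_vf_*_m style intrinsics take the mask first, followed by the masked-off source. The kernels use this to compute absolute values branch-free; a sketch using the macro names from the amax/amin hunks above (hypothetical wrapper, same toolchain assumptions as the sum sketch earlier):

FLOAT_V_T vabs_sketch(FLOAT_V_T vx, unsigned int gvl)
{
    MASK_T mask = VMFLTVF_FLOAT(vx, 0, gvl);   /* lanes where x < 0 */
    /* reverse-subtract under mask: masked lanes become 0 - x,
       unmasked lanes keep x (the masked-off source) */
    return VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
}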
#if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -89,42 +93,45 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = 
VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -136,7 +143,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -146,35 +153,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index 44af7101b..42705f5de 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
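The iamin/iamax kernels above also show the index-tracking pattern these argmin/argmax fixes rely on: lanes that improve on the running extremum get their lane id written by a masked vid, then the current block offset j is added under the same mask, so v_min_index always holds the global index of each lane's best candidate. A condensed imin-style sketch (no absolute-value step) with the macros from the iamin_vector.c hunk above; unit stride only, tail handling and the BLAS 1-based result adjustment omitted, FLT_MAX from <float.h>:

BLASLONG imin_sketch(BLASLONG n, FLOAT *x)   /* hypothetical helper */
{
    unsigned int gvl = VSETVL_MAX;
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, gvl);
    FLOAT_V_T_M1 v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);   /* identity for min */
    gvl = VSETVL(n);
    FLOAT_V_T  v_min       = VFMVVF_FLOAT(FLT_MAX, gvl);
    UINT_V_T   v_min_index = VMVVX_UINT(0, gvl);
    MASK_T mask;
    for (BLASLONG j = 0; j + gvl <= n; j += gvl) {
        FLOAT_V_T vx = VLEV_FLOAT(&x[j], gvl);
        mask        = VMFLTVV_FLOAT(vx, v_min, gvl);          /* improving lanes  */
        v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* lane id 0..gvl-1 */
        v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
        v_min       = VFMINVV_FLOAT(v_min, vx, gvl);
    }
    v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);       /* scalar minimum   */
    mask  = VMFLEVF_FLOAT(v_min, v_res[0], gvl);              /* lanes equal to it */
    return v_min_index[VMFIRSTM(mask, gvl)];                  /* first one's index */
}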
#if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,8 +89,13 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ @@ -94,27 +103,25 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, 
mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -126,7 +133,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -137,28 +144,26 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c index 598cba387..ffc65226e 100644 --- a/kernel/riscv64/imin.c +++ b/kernel/riscv64/imin.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index e6e0e9f9f..3afa74dd6 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
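The one-character imin.c change above is a genuine bug fix, independent of the intrinsics migration: with '>' the scalar fallback updated its candidate whenever a larger element appeared, so it returned the position of the maximum. For clarity, the corrected loop in standalone form (hypothetical 0-based helper; the real kernel additionally follows the BLAS calling conventions):

typedef long BLASLONG;
typedef float FLOAT;

BLASLONG imin_scalar(BLASLONG n, const FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    BLASLONG min = 0, ix = inc_x;
    FLOAT minf = x[0];
    for (BLASLONG i = 1; i < n; i++) {
        if (x[ix] < minf) {        /* '>' here made this an argmax */
            min = i;
            minf = x[ix];
        }
        ix += inc_x;
    }
    return min;
}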
#if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,15 +89,20 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -113,26 
+122,24 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -143,7 +150,7 @@ asm volatile( } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -154,7 +161,7 @@ asm volatile( //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -175,27 +182,25 @@ asm volatile( #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index 62c95d973..ddb5eabde 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -30,47 +30,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
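The one-character imin.c change above is the changelog's "fixed the implementation of xIMIN": the scalar kernel compared with '>' and so returned the position of the maximum. Note also that the masked intrinsics now take the mask as their first operand (vid_v_u64m8_m(mask, maskedoff, vl)), which is why every VIDV_MASK_UINT and VADDVX_MASK_UINT call site is reordered. For reference, the semantics the vector kernels must reproduce, including the first-occurrence tie-breaking that the masked vid/vadd index tracking plus vmfirst achieves, look like this plain-C sketch (hypothetical helper, 0-based index):

static long imin_ref(const double *x, long n, long inc_x)
{
    long ix = inc_x, pos = 0;
    double minf = x[0];
    for (long i = 1; i < n; i++, ix += inc_x) {
        if (x[ix] < minf) {   /* strict '<': ties keep the earliest index */
            minf = x[ix];
            pos = i;
        }
    }
    return pos;
}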
#if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif #define RVV_M RVV_M8 @@ -86,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + gvl = VSETVL(n); v_max_index = 
VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-1, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -96,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -119,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -143,7 +154,7 @@ asm volatile( //index where element greater than v_max mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); + v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -163,7 +174,7 @@ asm volatile( :"v0"); #endif */ - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); + v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); @@ -171,19 +182,19 @@ asm volatile( ix += inc_xv; } vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask0,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -206,7 +217,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -227,9 +238,8 @@ asm volatile( #endif */ v_max = VFADDVV_FLOAT(vx0, vx1, gvl); - vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - FLOAT cur_maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 38eccf1b5..6e328dc31 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,50 +31,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
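As in the reference BLAS, the complex amax/amin kernels rank elements by cabs1(z) = |Re z| + |Im z| rather than the Euclidean modulus, and fabs is computed branch-free: a vmflt mask marks the negative lanes and a masked vfrsub rewrites just those lanes as 0 - x. A scalar sketch of both idioms (hypothetical helpers, not part of the patch):

#include <math.h>

/* what each mask/vfrsub pair above computes, one lane at a time */
static double fabs_by_masked_negate(double v)
{
    return (v < 0.0) ? 0.0 - v : v;   /* lanes with v >= 0 pass through */
}

/* the ranking function reduced by izamax/izamin */
static double cabs1(const double *z)  /* z points at { re, im } */
{
    return fabs(z[0]) + fabs(z[1]);
}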
#if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -87,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = 
VFMVVF_FLOAT_M1(FLT_MAX, gvl); + + gvl = VSETVL(n); v_min_index = VMVVX_UINT(0, gvl); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -97,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -120,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -144,7 +154,7 @@ asm volatile( //index where element less than v_min mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); + v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -164,27 +174,26 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); + v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, gvl); j += gvl; ix += inc_xv; } - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask0,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -207,7 +216,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -228,9 +237,8 @@ asm volatile( #endif */ v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - FLOAT cur_minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index 4ef75452d..0fc59b74c 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
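All of the strided complex kernels pass a byte stride to vlse: with an element increment of inc_x, consecutive complex elements sit 2 * inc_x floats apart, hence stride_x = inc_x * 2 * sizeof(FLOAT), and the imaginary lanes are loaded with the same stride starting one float later. In scalar terms (hypothetical helper for illustration):

/* element k of the two strided loads used above */
static void gather_complex(const double *x, long inc_x, long k,
                           double *re, double *im)
{
    *re = x[2 * inc_x * k];      /* vlse from &x[ix] with byte stride */
    *im = x[2 * inc_x * k + 1];  /* vlse from &x[ix + 1], same stride */
}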
#include #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); @@ -96,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index 83c965bfa..8223fa87a 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
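max_vector.c above and min_vector.c below share one loop shape: a main loop unrolled by two vectors, entered only when at least two full vectors fit (the gvl <= n/2 guard), followed by a strip-mined remainder in which VSETVL(n-j) shrinks the final iteration so no scalar cleanup loop is needed. The control flow, sketched in plain C with the vector ops collapsed to scalar loops:

#include <float.h>

static float kernel_shape_max(const float *x, long n, long gvl)
{
    float m = -FLT_MAX;
    long j = 0;
    if (gvl <= n / 2) {                          /* 2x-unrolled main loop */
        for (long i = 0; i < n / (gvl * 2); i++, j += gvl * 2)
            for (long k = 0; k < 2 * gvl; k++)
                if (x[j + k] > m) m = x[j + k];
    }
    while (j < n) {                              /* strip-mined remainder */
        long len = (n - j < gvl) ? n - j : gvl;  /* plays VSETVL(n - j) */
        for (long k = 0; k < len; k++)
            if (x[j + k] > m) m = x[j + k];
        j += len;
    }
    return m;
}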
#include #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VSEV_FLOAT vsev_float32xm8 -#define VSSEV_FLOAT vssev_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VSEV_FLOAT vse_v_f32m8 +#define VSSEV_FLOAT vsse_v_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VSEV_FLOAT vsev_float64xm8 -#define VSSEV_FLOAT vssev_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VSEV_FLOAT vse_v_f64m8 +#define VSSEV_FLOAT vsse_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ for(i=0,j=0; i 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < len / gvl; k++){ va = VLEV_FLOAT(&a_ptr[i], gvl); @@ -89,11 +97,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = 
VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -101,9 +108,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += alpha * temp2; @@ -121,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < len / gvl; k++){ @@ -136,11 +142,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -148,9 +153,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += alpha * temp2; @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); vr = VFMVVF_FLOAT(0, gvl); inc_xv = inc_x * gvl; for(k = 0; k < len / gvl; k++){ @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += alpha * temp2; @@ -220,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_xv = inc_x * gvl; inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); @@ -237,11 +239,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -249,9 +250,8 @@ int CNAME(BLASLONG m, BLASLONG 
offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += alpha * temp2; diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 29e0e4b65..7229a48b1 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -27,33 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSEV_FLOAT vsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFDOTVV_FLOAT vfdotvv_float32xm4 -#define VFMULVV_FLOAT vfmulvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSEV_FLOAT vse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFDOTVV_FLOAT vfdot_vv_f32m4 +#define VFMULVV_FLOAT vfmul_vv_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSEV_FLOAT vsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFDOTVV_FLOAT vfdotvv_float64xm4 -#define VFMULVV_FLOAT vfmulvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSEV_FLOAT vse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFDOTVV_FLOAT vfdot_vv_f64m4 +#define VFMULVV_FLOAT vfmul_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -65,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT temp2; FLOAT *a_ptr = a; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); FLOAT_V_T va, vx, vy, vr; BLASLONG stride_x, stride_y, inc_xv, inc_yv; @@ -78,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA temp2 = 0.0; if(j > 0){ i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ vy = VLEV_FLOAT(&y[i], gvl); @@ -91,11 +99,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT 
*a, BLASLONG lda, FLOA i += gvl; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -103,9 +110,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -122,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA if(j > 0){ iy = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ @@ -137,11 +143,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -149,9 +154,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA if(j > 0){ ix = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = inc_x * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -219,7 +221,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix = 0; iy = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = inc_x * gvl; inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); @@ -236,11 +238,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, 
gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -248,9 +249,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index a6c742b14..5cd65b225 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,19 +66,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_max = VFMVVF_FLOAT(0, gvl); BLASLONG inc_xv = inc_x * gvl * 2; for(; i maxf) - maxf = v_max[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; } return(maxf); } diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index 44a7cf1dc..9d567b3da 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
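The symv kernels above follow a common accumulate-then-reduce pattern: per column, the partial dot products stay in a full-width accumulator updated by vfmacc, and only after the strip loop is that accumulator collapsed by a single vfredsum, so the slow reduction runs once per column rather than once per strip. A minimal sketch using the f32m4 intrinsic names from symv_U_vector.c; the hypothetical dot_sketch helper assumes n > 0 and omits the VSETVL(j-i) tail iteration the kernels re-issue:

#include <riscv_vector.h>

static float dot_sketch(const float *a, const float *x, size_t n)
{
    size_t gvl = vsetvlmax_e32m1();
    vfloat32m1_t v_res = vfmv_v_f_f32m1(0, gvl);
    vfloat32m1_t v_z0  = vfmv_v_f_f32m1(0, gvl);     /* additive-identity seed */

    gvl = vsetvl_e32m4(n);
    vfloat32m4_t vr = vfmv_v_f_f32m4(0, gvl);
    size_t i = 0;
    for (; i + gvl <= n; i += gvl)                   /* one vfmacc per strip */
        vr = vfmacc_vv_f32m4(vr, vle_v_f32m4(&a[i], gvl),
                                 vle_v_f32m4(&x[i], gvl), gvl);

    v_res = vfredsum_vs_f32m4_f32m1(v_res, vr, v_z0, gvl);  /* one reduction */
    return v_res[0];
}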
#include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -63,18 +67,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); BLASLONG inc_xv = inc_x * gvl * 2; for(; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 -#define MASK_T e64xm8_t -#define
VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -61,40 +65,44 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; if(inc_x == 1){ BLASLONG n2 = n * 2; - gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n2); v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n2/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_xv = incx * gvl * 2; inc_yv = incy * gvl * 2; inc_av = gvl * 2; @@ -134,13 +141,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 = vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 = vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 = v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -173,11 +179,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 += vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 += vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 += v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 += v_res[0]; } } y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 6fe12c76c..40cd9cd64 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
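In the zhemv kernels the conjugation is folded directly into the accumulation: one part of conj(a)*x is built with vfmacc and the cross term is subtracted with vfnmsac (or the other way around for the unconjugated variant, selected by the #ifdef in the body). The scalar computation behind those vfmacc/vfnmsac pairs, as a hypothetical reference helper with an explicit conj flag:

static void zdot_ref(const double *a, const double *x, long n,
                     int conj, double *re, double *im)
{
    double r = 0.0, s = 0.0;
    for (long k = 0; k < n; k++) {
        double ar = a[2 * k], ai = conj ? -a[2 * k + 1] : a[2 * k + 1];
        double xr = x[2 * k], xi = x[2 * k + 1];
        r += ar * xr - ai * xi;   /* vfmacc lane + vfnmsac lane */
        s += ar * xi + ai * xr;
    }
    *re = r;
    *im = s;
}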
#include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFMULVV_FLOAT vfmulvv_float32xm4 -#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 -#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFMULVV_FLOAT vfmulvv_float64xm4 -#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 -#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -62,7 +66,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B FLOAT temp_r2, temp_i2; FLOAT *a_ptr = a; unsigned int gvl = 0; - + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; @@ -89,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B ia = 0; i = 0; if(j > 0){ - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = incx * gvl * 2; inc_yv = incy * gvl * 2; inc_av = gvl * 2; @@ -133,13 +140,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 = vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 = vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 = v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -172,11 +178,10 @@ int CNAME(BLASLONG m, BLASLONG 
offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 += vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 += vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 += v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 += v_res[0]; } } y[jy] += temp_r1 * a_ptr[ja]; diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index b0ebfa5f4..5ac62eb80 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -27,41 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFDOTVV_FLOAT vfdot_vv_f32m4 #define ABS fabsf -#define MASK_T e32xm4_t -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 -#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 -#define VMFIRSTM vmfirstm_e32xm4 -#define VFDIVVF_FLOAT vfdivvf_float32xm4 -#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#define MASK_T vbool8_t +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m +#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 +#define VMFIRSTM vmfirst_m_b8 +#define VFDIVVF_FLOAT vfdiv_vf_f32m4 +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFDOTVV_FLOAT vfdot_vv_f64m4 #define ABS fabs -#define MASK_T e64xm4_t -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 -#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 -#define VMFIRSTM vmfirstm_e64xm4 -#define VFDIVVF_FLOAT vfdivvf_float64xm4 -#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#define MASK_T vbool16_t +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m +#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 +#define VMFIRSTM vmfirst_m_b16 +#define VFDIVVF_FLOAT vfdiv_vf_f64m4 +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -73,19 +77,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T vr, v0, 
v_zero; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT scale = 0.0, ssq = 0.0; MASK_T mask; BLASLONG index = 0; if(inc_x == 1){ BLASLONG n2 = n * 2; - gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n2); vr = VFMVVF_FLOAT(0, gvl); v_zero = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VSEV_FLOAT vsev_float32xm8 -#define VSSEV_FLOAT vssev_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VSEV_FLOAT vse_v_f32m8 +#define VSSEV_FLOAT vsse_v_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VSEV_FLOAT vsev_float64xm8 -#define VSSEV_FLOAT vssev_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VSEV_FLOAT vse_v_f64m8 +#define VSSEV_FLOAT vsse_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG n2 = n * 2; if(gvl <= n2/2){ for(i=0,j=0; i #if defined(BULLDOZER) diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 9f2bf24e2..15185d7fc 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -149,6 +149,7 @@ #define KERNEL_h_k1m16n2 \ "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\ unit_acc_m16n2(8,9,10,11,%1) + #endif #define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) @@ -283,7 +284,32 @@ #define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_acc_m4n2(12,13,%%r15,%%r12,1) #define KERNEL_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;" #define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_acc_m4n2(14,15,%%r15,%%r12,2) -#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +//#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +#define unit_acc_k2m4n2(c1_no,c2_no,...)\ + "vbroadcastf64x4 ("#__VA_ARGS__"),%%zmm3; vpermpd %%zmm3,%%zmm30,%%zmm3;"\ + "vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" + +#define unit_merge_to_ymm(c1_no) \ + "vextractf64x4 $1,%%zmm"#c1_no",%%ymm30; vaddpd %%ymm"#c1_no",%%ymm30,%%ymm"#c1_no";" + +#define KERNEL_k1m4n12 \ + "cmpq $2, %5; jb 104912f;"\ + "vmovupd 64+%11,%%zmm30;"\ + "\n204912:"\ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \ + unit_acc_k2m4n2(4,5,%1) unit_acc_k2m4n2(6,7,%1,%%r12,1) unit_acc_k2m4n2(8, 9, %1, %%r12, 2) "addq $32,%1;" \ + unit_acc_k2m4n2(10,11,%%r15) unit_acc_k2m4n2(12,13,%%r15,%%r12,1) unit_acc_k2m4n2(14,15,%%r15,%%r12,2) "addq $32,%%r15;" \ + "subq $2, %5; cmpq $2, %5; jnb 204912b;"\ + unit_merge_to_ymm(4) 
unit_merge_to_ymm(5) unit_merge_to_ymm(6) unit_merge_to_ymm(7) \ + unit_merge_to_ymm(8) unit_merge_to_ymm(9) unit_merge_to_ymm(10) unit_merge_to_ymm(11) \ + unit_merge_to_ymm(12) unit_merge_to_ymm(13) unit_merge_to_ymm(14) unit_merge_to_ymm(15) \ + "testq %5, %5; jz 1004912f;"\ + "\n104912:"\ + KERNEL_h_k1m4n12 "addq $16,%%r15;"\ + "decq %5; jnz 104912b;"\ + "\n1004912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" #define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) @@ -336,7 +362,31 @@ #define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_acc_m2n2(12,13,%%r15,%%r12,1) #define KERNEL_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;" #define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_acc_m2n2(14,15,%%r15,%%r12,2) -#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" +//#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" + +#define unit_acc_k4m2n2(c1_no,c2_no,...) \ + "vmovupd ("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" + +#define unit_merge_to_xmm(c1_no) \ + "vextractf64x2 $0,%%zmm"#c1_no",%%xmm20; vextractf64x2 $1,%%zmm"#c1_no",%%xmm21; vextractf64x2 $2,%%zmm"#c1_no",%%xmm22; vextractf64x2 $3,%%zmm"#c1_no",%%xmm23;"\ + "vaddpd %%xmm20,%%xmm21,%%xmm20; vaddpd %%xmm22,%%xmm23,%%xmm22; vaddpd %%xmm20,%%xmm22,%%xmm"#c1_no";" + +#define KERNEL_k1m2n12 \ + "cmpq $4,%5; jb 102912f;"\ + "\n402912:"\ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \ + unit_acc_k4m2n2(4,5,%1) unit_acc_k4m2n2(6,7,%1,%%r12,1) unit_acc_k4m2n2(8,9,%1,%%r12,2) "addq $64,%1;" \ + unit_acc_k4m2n2(10,11,%%r15) unit_acc_k4m2n2(12,13,%%r15,%%r12,1) unit_acc_k4m2n2(14,15,%%r15,%%r12,2) "addq $64,%%r15;" \ + "subq $4,%5; cmpq $4,%5; jnb 402912b;"\ + unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \ + unit_merge_to_xmm(10) unit_merge_to_xmm(11) unit_merge_to_xmm(12) unit_merge_to_xmm(13) unit_merge_to_xmm(14) unit_merge_to_xmm(15) \ + "testq %5,%5; jz 1002912f;"\ + "\n102912:"\ + KERNEL_h_k1m2n12 "addq $16,%%r15;" \ + "decq %5; jnz 102912b;" \ + "\n1002912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" #define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) @@ -387,7 +437,24 @@ #define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 "vfmadd231pd (%%r15,%%r12,1),%%xmm1,%%xmm8;" #define KERNEL_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;" #define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 "vfmadd231pd (%%r15,%%r12,2),%%xmm1,%%xmm9;" -#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +//#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +#define KERNEL_k1m1n12 \ + "cmpq $4,%5; jb 101912f;" \ + "vmovupd %11,%%zmm2;"\ + "\n401912:"\ + "vmovupd (%0),%%ymm1; vpermpd %%zmm1,%%zmm2,%%zmm1; addq $32,%0;" \ + "vfmadd231pd (%1),%%zmm1,%%zmm4; vfmadd231pd (%1,%%r12,1),%%zmm1,%%zmm5; vfmadd231pd (%1,%%r12,2),%%zmm1,%%zmm6; addq $64,%1;"\ + "vfmadd231pd (%%r15),%%zmm1,%%zmm7; vfmadd231pd (%%r15,%%r12,1),%%zmm1,%%zmm8; vfmadd231pd (%%r15,%%r12,2),%%zmm1,%%zmm9; addq $64,%%r15;"\ + "subq $4,%5; cmpq $4,%5; jnb 401912b;"\ + unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) \ + unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \ + "testq %5,%5; jz 1001912f;"\ + "\n101912:"\ + KERNEL_h_k1m1n12 "addq 
$16,%%r15;" \ + "decq %5; jnz 101912b;" \ + "\n1001912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" #define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" @@ -480,7 +547,7 @@ COMPUTE_SIMPLE(1,ndim) "subq $1,%%r11;"\ #ndim"33106:\n\t"\ "movq %%r14,%1;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K):"r10","r11","r12","r13","r14","r15","cc","memory",\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K), "o"(permute_table):"r10","r11","r12","r13","r14","r15","cc","memory",\ "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\ "zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\ a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ndim * ldc - M; TAIL_SET_OFF(ndim)\ @@ -501,6 +568,10 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; BLASLONG n_count = n, off = 0; double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*b_pref = B; + int64_t permute_table[] = { + 0, 0, 1, 1, 2, 2, 3, 3, // abcdxxxx -> aabbccdd + 0, 1, 0, 1, 2, 3, 2, 3, // abcdxxxx -> ababcdcd + }; #ifdef TRMMKERNEL #ifdef LEFT off = offset; diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 3eec21774..06de28d97 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,8 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
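The new AVX-512 sgemv path below descends through 128/64/32/16-row blocks and finishes the last m % 16 rows with a lane mask, so no scalar tail loop is needed: a __mmask16 built from m & 15 keeps _mm512_maskz_loadu_ps and the masked store inside the arrays. A self-contained sketch of that tail step (hypothetical helper, assuming AVX-512F and m % 16 != 0 on entry):

#include <immintrin.h>

static void axpy_tail(long m, float alpha, const float *a, float *y)
{
    long tag_m_16x = m & (~15);                       /* start of the tail */
    __mmask16 tail = (__mmask16)(0xffff >> (16 - (m & 15)));

    __m512 va = _mm512_maskz_loadu_ps(tail, &a[tag_m_16x]); /* zero-fills off lanes */
    __m512 vy = _mm512_maskz_loadu_ps(tail, &y[tag_m_16x]);
    vy = _mm512_fmadd_ps(va, _mm512_set1_ps(alpha), vy);
    _mm512_mask_storeu_ps(&y[tag_m_16x], tail, vy);   /* writes masked lanes only */
}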
#include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) #include "sgemv_n_microk_haswell-4.c" +#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#include "sgemv_n_microk_haswell-4.c" +#include "sgemv_n_microk_skylakex-8.c" #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) @@ -291,6 +294,41 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + if ( m < 1 || n < 1) return(0); + + #ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL + if (m <= 16384 && n <= 48 && !(n == 4)) + { + FLOAT * xbuffer_align = x; + FLOAT * ybuffer_align = y; + + FLOAT * xbuffer = NULL; + FLOAT * ybuffer = NULL; + + if (inc_x != 1) { + xbuffer_align = buffer; + for(BLASLONG i=0; i= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_SGEMV_N_SKYLAKE_KERNEL 1 +#include "common.h" +#include +static int sgemv_kernel_n_128(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, float *x, float *y) +{ + __m512 matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7; + __m512 xArray_0; + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); + BLASLONG tag_m_128x = m & (~127); + BLASLONG tag_m_64x = m & (~63); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]); + matrixArray_4 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 64]); + matrixArray_5 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 80]); + matrixArray_6 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 96]); + matrixArray_7 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 112]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2); + accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3); + accum512_4 = _mm512_fmadd_ps(matrixArray_4, xArray_0, accum512_4); + accum512_5 = _mm512_fmadd_ps(matrixArray_5, xArray_0, accum512_5); + accum512_6 = _mm512_fmadd_ps(matrixArray_6, xArray_0, accum512_6); + accum512_7 = _mm512_fmadd_ps(matrixArray_7, xArray_0, accum512_7); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 
32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + _mm512_storeu_ps(&y[idx_m + 64], _mm512_fmadd_ps(accum512_4, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 64]))); + _mm512_storeu_ps(&y[idx_m + 80], _mm512_fmadd_ps(accum512_5, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 80]))); + _mm512_storeu_ps(&y[idx_m + 96], _mm512_fmadd_ps(accum512_6, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 96]))); + _mm512_storeu_ps(&y[idx_m + 112], _mm512_fmadd_ps(accum512_7, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 112]))); + } + if (tag_m_128x != m) { + for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_64x; idx_m+=64) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2); + accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + } + + if(tag_m_64x != m) { + for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_32x; idx_m+=32) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + } + + if(tag_m_32x != m) { + + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + } + + if (tag_m_16x != m) { + accum512_0 = _mm512_setzero_ps(); + + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + for(BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + matrixArray_0 = _mm512_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_16x]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, 
accum512_0); + } + + _mm512_mask_storeu_ps(&y[tag_m_16x], tail_mask, _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_maskz_loadu_ps(tail_mask, &y[tag_m_16x]))); + + } + } + } + } + return 0; +} + +static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, float *x, float *y) +{ + __m256 ma0, ma1, ma2, ma3, ma4, ma5, ma6, ma7; + __m256 as0, as1, as2, as3, as4, as5, as6, as7; + __m256 alphav = _mm256_set1_ps(alpha); + __m256 xv; + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + __mmask8 one_mask = 0xff; + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + as0 = _mm256_setzero_ps(); + as1 = _mm256_setzero_ps(); + as2 = _mm256_setzero_ps(); + as3 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma0 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]); + ma1 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]); + ma2 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +16]); + ma3 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +24]); + + as0 = _mm256_maskz_fmadd_ps(one_mask, ma0, xv, as0); + as1 = _mm256_maskz_fmadd_ps(one_mask, ma1, xv, as1); + as2 = _mm256_maskz_fmadd_ps(one_mask, ma2, xv, as2); + as3 = _mm256_maskz_fmadd_ps(one_mask, ma3, xv, as3); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as0, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as1, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8]))); + _mm256_mask_storeu_ps(&y[idx_m + 16], one_mask, _mm256_maskz_fmadd_ps(one_mask, as2, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 16]))); + _mm256_mask_storeu_ps(&y[idx_m + 24], one_mask, _mm256_maskz_fmadd_ps(one_mask, as3, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 24]))); + + } + + if (tag_m_32x != m ) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + as4 = _mm256_setzero_ps(); + as5 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma4 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]); + ma5 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]); + + as4 = _mm256_maskz_fmadd_ps(one_mask, ma4, xv, as4); + as5 = _mm256_maskz_fmadd_ps(one_mask, ma5, xv, as5); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as4, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as5, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8]))); + } + + if (tag_m_16x != m ) { + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + as6 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma6 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m]); + as6 = _mm256_maskz_fmadd_ps(one_mask, ma6, xv, as6); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as6, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + } + + if (tag_m_8x != m) { + as7 = _mm256_setzero_ps(); + + unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(m&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_uint); + + for(BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma7 = _mm256_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_8x]); + + as7 = _mm256_maskz_fmadd_ps(tail_mask, 
ma7, xv, as7); + } + + _mm256_mask_storeu_ps(&y[tag_m_8x], tail_mask, _mm256_maskz_fmadd_ps(tail_mask, as7, alphav, _mm256_maskz_loadu_ps(tail_mask, &y[tag_m_8x]))); + + } + } + } + + return 0; +} + + +#endif \ No newline at end of file diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index fe886f57f..a36c8ace9 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,8 +34,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) #include "sgemv_t_microk_haswell-4.c" +#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#include "sgemv_t_microk_haswell-4.c" +#include "sgemv_t_microk_skylakex.c" #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) @@ -305,6 +308,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( m < 1 ) return(0); if ( n < 1 ) return(0); + #ifdef HAVE_SGEMV_T_SKYLAKE_KERNEL + if (lda == m && n <= 16384 && m <= 8) + { + FLOAT * xbuffer_align = x; + FLOAT * ybuffer_align = y; + + if (inc_x != 1) { + xbuffer_align = buffer; + for(BLASLONG i=0; i= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_SGEMV_T_SKYLAKE_KERNEL 1 +#include "common.h" +#include +#include "sgemv_t_microk_skylakex_template.c" + +//sgemv_t: +// ----- m ----- +// |<----------- +// |<----------- +// n +// |<----------- +// |<----------- + +static int sgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, float *a, float *x, float *y) +{ + switch(m) { + case 1: sgemv_kernel_t_1(n, alpha, a, x, y); break; + case 2: sgemv_kernel_t_2(n, alpha, a, x, y); break; + case 3: sgemv_kernel_t_3(n, alpha, a, x, y); break; + case 4: sgemv_kernel_t_4(n, alpha, a, x, y); break; + case 5: sgemv_kernel_t_5(n, alpha, a, x, y); break; + case 6: sgemv_kernel_t_6(n, alpha, a, x, y); break; + case 7: sgemv_kernel_t_7(n, alpha, a, x, y); break; + case 8: sgemv_kernel_t_8(n, alpha, a, x, y); break; + default: break; + } + return 0; +} + +#endif diff --git a/kernel/x86_64/sgemv_t_microk_skylakex_template.c b/kernel/x86_64/sgemv_t_microk_skylakex_template.c new file mode 100644 index 000000000..34415054c --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_skylakex_template.c @@ -0,0 +1,1120 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include "common.h" + +//Here the m means n in sgemv_t: +// ----- n ----- +// | +// | +// m +// | +// | +static int sgemv_kernel_t_1(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + //printf("enter into t_1 kernel\n"); + //printf("m = %ld\n", m); + __m512 matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + float alphaX = alpha * (*x); + __m512 ALPHAXVECTOR = _mm512_set1_ps(alphaX); + + BLASLONG tag_m_128x = m & (~127); + BLASLONG tag_m_64x = m & (~63); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_m + 48]); + matrixArray_4 = _mm512_loadu_ps(&a[idx_m + 64]); + matrixArray_5 = _mm512_loadu_ps(&a[idx_m + 80]); + matrixArray_6 = _mm512_loadu_ps(&a[idx_m + 96]); + matrixArray_7 = _mm512_loadu_ps(&a[idx_m + 112]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(matrixArray_2, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(matrixArray_3, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + _mm512_storeu_ps(&y[idx_m + 64], _mm512_fmadd_ps(matrixArray_4, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 64]))); + _mm512_storeu_ps(&y[idx_m + 80], _mm512_fmadd_ps(matrixArray_5, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 80]))); + _mm512_storeu_ps(&y[idx_m + 96], _mm512_fmadd_ps(matrixArray_6, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 96]))); + _mm512_storeu_ps(&y[idx_m + 112], _mm512_fmadd_ps(matrixArray_7, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 112]))); + + } + + if (tag_m_128x != m) { + for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_64x; idx_m+=64) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_m + 48]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(matrixArray_2, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(matrixArray_3, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + + } + + if (tag_m_64x != m) { + for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_32x; idx_m+=32) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = 
_mm512_loadu_ps(&a[idx_m + 16]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + + } + + if (tag_m_32x != m) { + for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_16x; idx_m+=32) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + } + + if (tag_m_16x != m) { + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_ps(tail_mask, &a[tag_m_16x]); + + _mm512_mask_storeu_ps(&y[tag_m_16x], tail_mask, _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_maskz_loadu_ps(tail_mask, &y[tag_m_16x]))); + + } + + + } + } + } + + return 0; +} + +static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + __m512 m0, m1, m2, m3, col0_1, col0_2, col1_1, col1_2, x1Array, x2Array; + float x1a = x[0] * alpha; + float x2a = x[1] * alpha; + x1Array = _mm512_set1_ps(x1a); + x2Array = _mm512_set1_ps(x2a); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1); + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + m0 = _mm512_loadu_ps(&a[idx_m*2]); + m1 = _mm512_loadu_ps(&a[idx_m*2 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*2 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*2 + 48]); + col0_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1); + col0_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1); + col1_1 = _mm512_permutex2var_ps(m2, idx_base_0, m3); + col1_2 = _mm512_permutex2var_ps(m2, idx_base_1, m3); + + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col0_2, _mm512_mul_ps(col0_1, x1Array)), _mm512_loadu_ps(&y[idx_m]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m + 16]))); + } + if (tag_m_32x != m) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m]); + m1 = _mm512_loadu_ps(&a[idx_m + 16]); + col1_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1); + col1_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1); + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m]))); + } + if (tag_m_16x != m) { + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); + unsigned char load_mask_value = (((unsigned char)0xff) >> 6); + __mmask8 load_mask = *((__mmask8*) &load_mask_value); + x1Array = _mm512_broadcast_f32x2(_mm_maskz_loadu_ps(load_mask, x)); + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m0 = _mm512_loadu_ps(&a[idx_m]); + m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); + m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); + __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); + _mm256_storeu_ps(&y[idx_m], _mm256_add_ps(ret, _mm256_loadu_ps(&y[idx_m]))); + + } + + if (tag_m_8x != m) { + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-((m-tag_m_8x)*2)&15)); + __mmask16 a_mask = 
*((__mmask16*) &tail_mask_value); + unsigned char y_mask_value = (((unsigned char)0xff) >> (8-(m-tag_m_8x))); + __mmask8 y_mask = *((__mmask8*) &y_mask_value); + + m0 = _mm512_maskz_loadu_ps(a_mask, &a[tag_m_8x]); + m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); + m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); + __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); + _mm256_mask_storeu_ps(&y[tag_m_8x], y_mask, _mm256_add_ps(ret, _mm256_maskz_loadu_ps(y_mask, &y[tag_m_8x]))); + } + } + } + return 0; +} + +static int sgemv_kernel_t_3(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + __m512 m0, m1, m2, c1, c2, c3, tmp, x1Array, x2Array, x3Array; + float x1a = x[0] * alpha; + float x2a = x[1] * alpha; + float x3a = x[2] * alpha; + x1Array = _mm512_set1_ps(x1a); + x2Array = _mm512_set1_ps(x2a); + x3Array = _mm512_set1_ps(x3a); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_s1 = _mm512_set1_epi32(-1); + __m512i idx_c1_1 = _mm512_set_epi32(0, 0, 0, 0, 0, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i idx_c2_1 = _mm512_add_epi32(idx_c1_1, M512_EPI32_1); + __m512i idx_c3_1 = _mm512_add_epi32(idx_c2_1, M512_EPI32_1); + + __m512i idx_c3_2 = _mm512_set_epi32(31, 28, 25, 22, 19, 16, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m512i idx_c2_2 = _mm512_add_epi32(idx_c3_2, M512_EPI32_s1); + __m512i idx_c1_2 = _mm512_add_epi32(idx_c2_2, M512_EPI32_s1); + + __mmask16 step_1 = 0x07ff; + __mmask16 step_2 = 0xf800; + __mmask16 c31 = 0x03ff; + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*3]); + m1 = _mm512_loadu_ps(&a[idx_m*3 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*3 + 32]); + + tmp = _mm512_mask_permutex2var_ps(m0, step_1, idx_c1_1, m1); + c1 = _mm512_mask_permutex2var_ps(tmp, step_2, idx_c1_2, m2); + tmp = _mm512_mask_permutex2var_ps(m0, step_1, idx_c2_1, m1); + c2 = _mm512_mask_permutex2var_ps(tmp, step_2, idx_c2_2, m2); + tmp = _mm512_mask_permutex2var_ps(m0, c31, idx_c3_1, m1); + c3 = _mm512_permutex2var_ps(tmp, idx_c3_2, m2); + + tmp = _mm512_fmadd_ps(x2Array, c2, _mm512_mul_ps(c1, x1Array)); + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x3Array, c3, tmp), _mm512_loadu_ps(&y[idx_m]))); + } + + if(tag_m_16x != m) { + __mmask8 a_mask = 0xff; + __m256i M256_EPI32_1 = _mm256_maskz_set1_epi32(a_mask, 1); + __m256i M256_EPI32_s1 = _mm256_maskz_set1_epi32(a_mask, -1); + __m256i idx_c1_1 = _mm256_set_epi32(0, 0, 15, 12, 9, 6, 3, 0); + __m256i idx_c2_1 = _mm256_add_epi32(idx_c1_1, M256_EPI32_1); + __m256i idx_c3_1 = _mm256_add_epi32(idx_c2_1, M256_EPI32_1); + + __m256i idx_c3_2 = _mm256_set_epi32(15, 12, 9, 0, 0, 0, 0, 0); + __m256i idx_c2_2 = _mm256_add_epi32(idx_c3_2, M256_EPI32_s1); + __m256i idx_c1_2 = _mm256_add_epi32(idx_c2_2, M256_EPI32_s1); + + __mmask8 step_1 = 0x1f; + __mmask8 step_2 = 0xe0; + __mmask8 c12 = 0xc0; + + __m256 m256_0, m256_1, m256_2, tmp256, c256_1, c256_2, c256_3, x256_1, x256_2, x256_3; + x256_1 = _mm256_set1_ps(x1a); + x256_2 = _mm256_set1_ps(x2a); + x256_3 = _mm256_set1_ps(x3a); + + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m256_0 = _mm256_loadu_ps(&a[idx_m*3]); + m256_1 = _mm256_loadu_ps(&a[idx_m*3 + 8]); + m256_2 = _mm256_loadu_ps(&a[idx_m*3 + 16]); + + tmp256 = _mm256_permutex2var_ps(m256_0, idx_c1_1, m256_1); + c256_1 = 
_mm256_mask_permutex2var_ps(tmp256, c12, idx_c1_2, m256_2); + tmp256 = _mm256_mask_permutex2var_ps(m256_0, step_1, idx_c2_1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(tmp256, step_2, idx_c2_2, m256_2); + tmp256 = _mm256_mask_permutex2var_ps(m256_0, step_1, idx_c3_1, m256_1); + c256_3 = _mm256_mask_permutex2var_ps(tmp256, step_2, idx_c3_2, m256_2); + + tmp256 = _mm256_fmadd_ps(x256_2, c256_2, _mm256_mul_ps(c256_1, x256_1)); + _mm256_storeu_ps(&y[idx_m], _mm256_maskz_add_ps(a_mask, _mm256_fmadd_ps(x256_3, c256_3, tmp256), _mm256_loadu_ps(&y[idx_m]))); + } + + if(tag_m_8x != m){ + for (BLASLONG idx_m = tag_m_8x; idx_m < tag_m_4x; idx_m+=4){ + m0 = _mm512_maskz_loadu_ps(0x0fff, &a[tag_m_8x*3]); + m256_0 = _mm512_extractf32x8_ps(m0, 0); + m256_1 = _mm512_extractf32x8_ps(m0, 1); + __m256i idx1 = _mm256_set_epi32(10, 7, 4, 1, 9, 6, 3, 0); + __m256i M256_EPI32_2 = _mm256_maskz_set1_epi32(0x0f, 2); + __m256i idx2 = _mm256_add_epi32(idx1, M256_EPI32_2); + + c256_1 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(m256_0, 0x0f, idx2, m256_1); + + __m128 c128_1 = _mm256_extractf32x4_ps(c256_1, 0); + __m128 c128_2 = _mm256_extractf32x4_ps(c256_1, 1); + __m128 c128_3 = _mm256_extractf32x4_ps(c256_2, 0); + + __m128 x128_1 = _mm_set1_ps(x1a); + __m128 x128_2 = _mm_set1_ps(x2a); + __m128 x128_3 = _mm_set1_ps(x3a); + + __m128 tmp128 = _mm_maskz_fmadd_ps(0x0f, c128_1, x128_1, _mm_maskz_mul_ps(0x0f, c128_2, x128_2)); + _mm_mask_storeu_ps(&y[idx_m], 0x0f, _mm_maskz_add_ps(0x0f, _mm_maskz_fmadd_ps(0x0f, c128_3, x128_3, tmp128), _mm_maskz_loadu_ps(0x0f, &y[idx_m]))); + } + + if(tag_m_4x != m) { + for (BLASLONG idx_m = tag_m_4x; idx_m < tag_m_2x; idx_m+=2) { + m256_0 = _mm256_maskz_loadu_ps(0x3f, &a[idx_m*3]); + __m128 a128_1 = _mm256_extractf32x4_ps(m256_0, 0); + __m128 a128_2 = _mm256_extractf32x4_ps(m256_0, 1); + __m128 x128 = _mm_maskz_loadu_ps(0x07, x); + + __m128i idx128_1= _mm_set_epi32(0, 2, 1, 0); + __m128i M128_EPI32_3 = _mm_maskz_set1_epi32(0x07, 3); + __m128i idx128_2 = _mm_add_epi32(idx128_1, M128_EPI32_3); + + __m128 c128_1 = _mm_maskz_permutex2var_ps(0x07, a128_1, idx128_1, a128_2); + __m128 c128_2 = _mm_maskz_permutex2var_ps(0x07, a128_1, idx128_2, a128_2); + + __m128 tmp128 = _mm_hadd_ps(_mm_maskz_mul_ps(0x07, c128_1, x128), _mm_maskz_mul_ps(0x07, c128_2, x128)); + float ret[4]; + _mm_mask_storeu_ps(ret, 0x0f, tmp128); + y[idx_m] += alpha *(ret[0] + ret[1]); + y[idx_m+1] += alpha * (ret[2] + ret[3]); + } + + if(tag_m_2x != m) { + y[tag_m_2x] += alpha*(a[tag_m_2x*3]*x[0] + a[tag_m_2x*3+1]*x[1] + a[tag_m_2x*3+2]*x[2]); + } + } + } + } + + return 0; +} + +static int sgemv_kernel_t_4(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + __m512 m0, m1, m2; + __m256 m256_0, m256_1, c256_1, c256_2; + __m128 c1, c2, c3, c4, ret; + __m128 xarray = _mm_maskz_loadu_ps(0x0f, x); + __m512 x512 = _mm512_broadcast_f32x4(xarray); + __m512 alphavector = _mm512_set1_ps(alpha); + __m512 xa512 = _mm512_mul_ps(x512, alphavector); + __m256i idx1 = _mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0); + __m256i idx2 = _mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2); + + + for (BLASLONG idx_m = 0; idx_m < tag_m_4x; idx_m+=4) { + m0 = _mm512_loadu_ps(&a[idx_m*4]); + m1 = _mm512_mul_ps(m0, xa512); + m256_0 = _mm512_extractf32x8_ps(m1, 0); + m256_1 = _mm512_extractf32x8_ps(m1, 1); + c256_1 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx2, m256_1); 
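/* Reference picture (a plain-C sketch, not the code used here): with
 * lda == m and k = 1..8 columns per row, every sgemv_kernel_t_k in this
 * file is an unrolled form of
 *
 *     for (BLASLONG j = 0; j < rows; j++) {      // "m" in this file
 *         float sum = 0.0f;
 *         for (int c = 0; c < k; c++)
 *             sum += a[j*k + c] * x[c];
 *         y[j] += alpha * sum;
 *     }
 *
 * _mm512_permutex2var_ps(lo, idx, hi) selects lanes from the 32-float
 * concatenation of lo and hi (index 0..15 picks lo[index], 16..31 picks
 * hi[index-16]), which is how the strided column values a[j*k + c] are
 * gathered into contiguous vectors before the FMAs. */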
+ + c1 = _mm256_extractf32x4_ps(c256_1, 0); + c2 = _mm256_extractf32x4_ps(c256_1, 1); + c3 = _mm256_extractf32x4_ps(c256_2, 0); + c4 = _mm256_extractf32x4_ps(c256_2, 1); + + ret = _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, c1, c2), _mm_maskz_add_ps(0xff, c3, c4)), _mm_maskz_loadu_ps(0xff, y)); + _mm_mask_storeu_ps(&y[idx_m], 0xff, ret); + } + + if(tag_m_4x != m) { + float result[4]; + for(BLASLONG idx_m=tag_m_4x; idx_m < tag_m_2x; idx_m+=2) { + m256_0 = _mm256_maskz_loadu_ps(0xff, &a[idx_m*4]); + c1 = _mm256_maskz_extractf32x4_ps(0xff, m256_0, 0); + c2 = _mm256_maskz_extractf32x4_ps(0xff, m256_0, 1); + + c3 = _mm_maskz_mul_ps(0x0f, c1, xarray); + c4 = _mm_maskz_mul_ps(0x0f, c2, xarray); + + ret = _mm_hadd_ps(c3, c4); + _mm_mask_storeu_ps(result, 0x0f, ret); + y[idx_m] += alpha *(result[0] + result[1]); + y[idx_m+1] += alpha * (result[2] + result[3]); + } + + if(tag_m_2x != m ) { + c1 = _mm_maskz_loadu_ps(0x0f, &a[tag_m_2x * 4]); + c2 = _mm_maskz_mul_ps(0x0f, c1, xarray); + _mm_mask_storeu_ps(result, 0x0f, c2); + y[tag_m_2x] += alpha *(result[0] + result[1] + result[2] + result[3]); + } + } + + return 0; +} + +static int sgemv_kernel_t_5(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + __m512 m0, m1, m2, m3, m4, tmp0, tmp1, tmp2, accum, c0, c1, c2, c3, c4; + __m512 x0_512 = _mm512_set1_ps(x[0]); + __m512 x1_512 = _mm512_set1_ps(x[1]); + __m512 x2_512 = _mm512_set1_ps(x[2]); + __m512 x3_512 = _mm512_set1_ps(x[3]); + __m512 x4_512 = _mm512_set1_ps(x[4]); + __m512 alpha_512 = _mm512_set1_ps(alpha); + + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_16 = _mm512_set1_epi32(16); + __m512i M512_EPI32_0 = _mm512_setzero_epi32(); + + __m512i idx_c0 = _mm512_set_epi32(27, 22, 17, 28, 23, 18, 13, 8, 3, 30, 25, 20, 15, 10, 5, 0); + __m512i idx_c1 = _mm512_add_epi32(idx_c0, M512_EPI32_1); + __m512i idx_c2 = _mm512_add_epi32(idx_c1, M512_EPI32_1); + idx_c2 = _mm512_mask_blend_epi32(0x0040, idx_c2, M512_EPI32_0); + __m512i idx_c3 = _mm512_add_epi32(idx_c2, M512_EPI32_1); + __m512i idx_c4 = _mm512_add_epi32(idx_c3, M512_EPI32_1); + idx_c4 = _mm512_mask_blend_epi32(0x1000, idx_c4, M512_EPI32_16); + + for (BLASLONG idx_m=0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*5]); + m1 = _mm512_loadu_ps(&a[idx_m*5 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*5 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*5 + 48]); + m4 = _mm512_loadu_ps(&a[idx_m*5 + 64]); + + tmp0 = _mm512_maskz_permutex2var_ps(0x007f, m0, idx_c0, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1f80, m2, idx_c0, m3); + c0 = _mm512_mask_blend_ps(0x1f80, tmp0, tmp1); + c0 = _mm512_mask_permutex2var_ps(c0, 0xe000, idx_c0, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x007f, m0, idx_c1, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1f80, m2, idx_c1, m3); + c1 = _mm512_mask_blend_ps(0x1f80, tmp0, tmp1); + c1 = _mm512_mask_permutex2var_ps(c1, 0xe000, idx_c1, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c2, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1fc0, m2, idx_c2, m3); + c2 = _mm512_mask_blend_ps(0x1fc0, tmp0, tmp1); + c2 = _mm512_mask_permutex2var_ps(c2, 0xe000, idx_c2, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c3, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1fc0, m2, idx_c3, m3); + c3 = _mm512_mask_blend_ps(0x1fc0, tmp0, tmp1); + c3 = _mm512_mask_permutex2var_ps(c3, 0xe000, idx_c3, m4); + + tmp0 = 
_mm512_maskz_permutex2var_ps(0x003f, m0, idx_c4, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x0fc0, m2, idx_c4, m3); + c4 = _mm512_mask_blend_ps(0x0fc0, tmp0, tmp1); + c4 = _mm512_mask_permutex2var_ps(c4, 0xf000, idx_c4, m4); + + accum = _mm512_fmadd_ps(c1, x1_512, _mm512_mul_ps(c0, x0_512)); + accum = _mm512_fmadd_ps(c2, x2_512, accum); + accum = _mm512_fmadd_ps(c3, x3_512, accum); + accum = _mm512_fmadd_ps(c4, x4_512, accum); + accum = _mm512_fmadd_ps(accum, alpha_512, _mm512_loadu_ps(&y[idx_m])); + _mm512_storeu_ps(&y[idx_m], accum); + + } + if(tag_m_16x !=m) { + __m512i idx_c0c2 = _mm512_set_epi32(0, 0, 27, 22, 17, 12, 7, 2 , 0, 30, 25, 20, 15, 10, 5, 0); + __m512i idx_c1c3 = _mm512_add_epi32(idx_c0c2, M512_EPI32_1); + idx_c4 = _mm512_add_epi32(idx_c1c3, M512_EPI32_1); + __m256i idx_c0m4 = _mm256_set_epi32(11, 6, 0, 0, 0, 0, 0, 0); + __m256i M256_EPI32_1 = _mm256_set1_epi32(1); + __m256i idx_c1m4 = _mm256_add_epi32(idx_c0m4, M256_EPI32_1); + __m256i idx_c2m4 = _mm256_add_epi32(idx_c1m4, M256_EPI32_1); + __m256i idx_c3m4 = _mm256_add_epi32(idx_c2m4, M256_EPI32_1); + __m256i idx_c4m4 = _mm256_add_epi32(idx_c3m4, M256_EPI32_1); + //TODO: below can change to use extract to decrease the latency + __m256 x0_256 = _mm256_set1_ps(x[0]); + __m256 x1_256 = _mm256_set1_ps(x[1]); + __m256 x2_256 = _mm256_set1_ps(x[2]); + __m256 x3_256 = _mm256_set1_ps(x[3]); + __m256 x4_256 = _mm256_set1_ps(x[4]); + __m256 alpha256 = _mm256_set1_ps(alpha); + __m256 accum_256, m256_4; + + for(BLASLONG idx_m=tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m0 = _mm512_loadu_ps(&a[idx_m*5]); + m1 = _mm512_loadu_ps(&a[idx_m*5 + 16]); + m256_4 = _mm256_loadu_ps(&a[idx_m*5 + 32]); + tmp0 = _mm512_permutex2var_ps(m0, idx_c0c2, m1); + tmp1 = _mm512_permutex2var_ps(m0, idx_c1c3, m1); + tmp2 = _mm512_permutex2var_ps(m0, idx_c4, m1); + + __m256 c256_0 = _mm512_extractf32x8_ps(tmp0, 0); + __m256 c256_2 = _mm512_extractf32x8_ps(tmp0, 1); + __m256 c256_1 = _mm512_extractf32x8_ps(tmp1, 0); + __m256 c256_3 = _mm512_extractf32x8_ps(tmp1, 1); + __m256 c256_4 = _mm512_extractf32x8_ps(tmp2, 1); + + c256_0 = _mm256_mask_permutex2var_ps(c256_0, 0x80, idx_c0m4, m256_4); + c256_1 = _mm256_mask_permutex2var_ps(c256_1, 0x80, idx_c1m4, m256_4); + c256_2 = _mm256_mask_permutex2var_ps(c256_2, 0xc0, idx_c2m4, m256_4); + c256_3 = _mm256_mask_permutex2var_ps(c256_3, 0xc0, idx_c3m4, m256_4); + c256_4 = _mm256_mask_permutex2var_ps(c256_4, 0xc0, idx_c4m4, m256_4); + + accum_256 = _mm256_fmadd_ps(c256_1, x1_256, _mm256_mul_ps(c256_0, x0_256)); + accum_256 = _mm256_fmadd_ps(c256_2, x2_256, accum_256); + accum_256 = _mm256_fmadd_ps(c256_3, x3_256, accum_256); + accum_256 = _mm256_fmadd_ps(c256_4, x4_256, accum_256); + accum_256 = _mm256_fmadd_ps(accum_256, alpha256, _mm256_loadu_ps(&y[idx_m])); + _mm256_storeu_ps(&y[idx_m], accum_256); + } + if(tag_m_8x != m) { + __m256i idx_c02 = _mm256_set_epi32(17, 12, 7, 2, 15, 10, 5, 0); + __m256i idx_c13 = _mm256_add_epi32(idx_c02, M256_EPI32_1); + __m256i idx_4 = _mm256_add_epi32(idx_c13, M256_EPI32_1); + __m128 accum_128; + __m256 m256_0, m256_1, tmp256_0, tmp256_1; + for (BLASLONG idx_m = tag_m_8x; idx_m < tag_m_4x; idx_m+=4){ + m256_0 = _mm256_loadu_ps(&a[idx_m*5]); + m256_1 = _mm256_loadu_ps(&a[idx_m*5 + 8]); + __m128 m128_4 = _mm_maskz_loadu_ps(0x0f, &a[idx_m*5 + 16]); + + tmp256_0 = _mm256_permutex2var_ps(m256_0, idx_c02, m256_1); + tmp256_1 = _mm256_permutex2var_ps(m256_0, idx_c13, m256_1); + __m256 tmp256_2 = _mm256_maskz_permutex2var_ps(0xf0, m256_0, idx_4, m256_1); + + __m128 c128_0 = 
_mm256_extractf32x4_ps(tmp256_0, 0); + __m128 c128_1 = _mm256_extractf32x4_ps(tmp256_1, 0); + __m128 c128_2 = _mm256_extractf32x4_ps(tmp256_0, 1); + __m128 c128_3 = _mm256_extractf32x4_ps(tmp256_1, 1); + __m128 c128_4 = _mm256_extractf32x4_ps(tmp256_2, 1); + + __m128i idx_c14 = _mm_set_epi32(4, 0, 0, 0); + __m128i M128_EPI32_1 = _mm_set1_epi32(1); + __m128i idx_c24 = _mm_add_epi32(idx_c14, M128_EPI32_1); + __m128i idx_c34 = _mm_add_epi32(idx_c24, M128_EPI32_1); + __m128i idx_c44 = _mm_add_epi32(idx_c34, M128_EPI32_1); + + c128_1 = _mm_mask_permutex2var_ps(c128_1, 0x08, idx_c14, m128_4); + c128_2 = _mm_mask_permutex2var_ps(c128_2, 0x08, idx_c24, m128_4); + c128_3 = _mm_mask_permutex2var_ps(c128_3, 0x08, idx_c34, m128_4); + c128_4 = _mm_mask_permutex2var_ps(c128_4, 0x08, idx_c44, m128_4); + + __m128 x128_0 = _mm256_extractf32x4_ps(x0_256, 0); + __m128 x128_1 = _mm256_extractf32x4_ps(x1_256, 0); + __m128 x128_2 = _mm256_extractf32x4_ps(x2_256, 0); + __m128 x128_3 = _mm256_extractf32x4_ps(x3_256, 0); + __m128 x128_4 = _mm256_extractf32x4_ps(x4_256, 0); + + __m128 alpha_128 = _mm256_extractf32x4_ps(alpha256, 0); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_1, x128_1, _mm_maskz_mul_ps(0x0f, c128_0, x128_0)); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_2, x128_2, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_3, x128_3, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_4, x128_4, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, accum_128, alpha_128, _mm_maskz_loadu_ps(0x0f, &y[idx_m])); + _mm_mask_storeu_ps(&y[idx_m], 0x0f, accum_128); + + } + + if(tag_m_4x !=m ){ + x0_256 = _mm256_maskz_loadu_ps(0x1f, x); + x0_256 = _mm256_mul_ps(x0_256, alpha256); + float ret8[8]; + + for(BLASLONG idx_m = tag_m_4x; idx_m < tag_m_2x; idx_m+=2){ + m256_0 = _mm256_maskz_loadu_ps(0x1f, &a[idx_m*5]); + m256_1 = _mm256_maskz_loadu_ps(0x1f, &a[idx_m*5 + 5]); + + m256_0 = _mm256_mul_ps(m256_0, x0_256); + m256_1 = _mm256_mul_ps(m256_1, x0_256); + + _mm256_mask_storeu_ps(ret8, 0x1f, m256_0); + y[idx_m] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + _mm256_mask_storeu_ps(ret8, 0x1f, m256_1); + y[idx_m+1] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + + } + + if(tag_m_2x != m){ + m256_0 = _mm256_maskz_loadu_ps(0x1f, &a[tag_m_2x*5]); + m256_0 = _mm256_mul_ps(m256_0, x0_256); + + + _mm256_mask_storeu_ps(ret8, 0x1f, m256_0); + y[tag_m_2x] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + + } + } + } + + } + return 0; +} + +static int sgemv_kernel_t_6(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + + __m512 m0, m1, m2, m3, m4, m5, c0, c1, c2, c3, c4, c5, tmp0, tmp1, tmp2, accum; + __m512i idx_c0 = _mm512_set_epi32(26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0); + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_0 = _mm512_setzero_epi32(); + __m512i M512_EPI32_16 = _mm512_set1_epi32(16); + __m512i idx_c1 = _mm512_add_epi32(idx_c0, M512_EPI32_1); + __m512i idx_c2 = _mm512_add_epi32(idx_c1, M512_EPI32_1); + idx_c2 = _mm512_mask_blend_epi32(0x0020, idx_c2, M512_EPI32_0); + __m512i idx_c3 = _mm512_add_epi32(idx_c2, M512_EPI32_1); + __m512i idx_c4 = _mm512_add_epi32(idx_c3, M512_EPI32_1); + idx_c4 = _mm512_mask_blend_epi32(0x0400, idx_c4, M512_EPI32_0); + __m512i idx_c5 = _mm512_add_epi32(idx_c4, M512_EPI32_1); + + __m512 x0_512 = _mm512_set1_ps(x[0]); + __m512 x1_512 = _mm512_set1_ps(x[1]); + __m512 x2_512 = _mm512_set1_ps(x[2]); 
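/* A note on the tail masks used throughout this file: for the final
 * m % 16 rows the kernels build a __mmask16 with the low (m & 15) bits
 * set via
 *
 *     unsigned short v = ((unsigned int)0xffff) >> (16 - (m & 15));
 *     __mmask16 tail = *((__mmask16*) &v);
 *
 * which is equivalent to the perhaps clearer sketch
 *
 *     __mmask16 tail = (__mmask16)((1u << (m & 15)) - 1);
 *
 * so that _mm512_maskz_loadu_ps / _mm512_mask_storeu_ps touch only the
 * remaining rows. Both forms assume m & 15 != 0, which the enclosing
 * tag_m_16x != m checks guarantee. */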
+ __m512 x3_512 = _mm512_set1_ps(x[3]); + __m512 x4_512 = _mm512_set1_ps(x[4]); + __m512 x5_512 = _mm512_set1_ps(x[5]); + __m512 alpha_512 = _mm512_set1_ps(alpha); + + for (BLASLONG idx_m=0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*6]); + m1 = _mm512_loadu_ps(&a[idx_m*6 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*6 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*6 + 48]); + m4 = _mm512_loadu_ps(&a[idx_m*6 + 64]); + m5 = _mm512_loadu_ps(&a[idx_m*6 + 80]); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c0, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07c0, m2, idx_c0, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c0, m5); + c0 = _mm512_mask_blend_ps(0x07c0, tmp0, tmp1); + c0 = _mm512_mask_blend_ps(0xf800, c0, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c1, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07c0, m2, idx_c1, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c1, m5); + c1 = _mm512_mask_blend_ps(0x07c0, tmp0, tmp1); + c1 = _mm512_mask_blend_ps(0xf800, c1, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c2, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07e0, m2, idx_c2, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c2, m5); + c2 = _mm512_mask_blend_ps(0x07e0, tmp0, tmp1); + c2 = _mm512_mask_blend_ps(0xf800, c2, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c3, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07e0, m2, idx_c3, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c3, m5); + c3 = _mm512_mask_blend_ps(0x07e0, tmp0, tmp1); + c3 = _mm512_mask_blend_ps(0xf800, c3, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c4, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x03e0, m2, idx_c4, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xfc00, m4, idx_c4, m5); + c4 = _mm512_mask_blend_ps(0x03e0, tmp0, tmp1); + c4 = _mm512_mask_blend_ps(0xfc00, c4, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c5 , m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x03e0, m2, idx_c5 , m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xfc00, m4, idx_c5 , m5); + c5 = _mm512_mask_blend_ps(0x03e0, tmp0, tmp1); + c5 = _mm512_mask_blend_ps(0xfc00, c5, tmp2); + + accum = _mm512_fmadd_ps(c1, x1_512, _mm512_mul_ps(c0, x0_512)); + accum = _mm512_fmadd_ps(c2, x2_512, accum); + accum = _mm512_fmadd_ps(c3, x3_512, accum); + accum = _mm512_fmadd_ps(c4, x4_512, accum); + accum = _mm512_fmadd_ps(c5, x5_512, accum); + accum = _mm512_fmadd_ps(accum, alpha_512, _mm512_loadu_ps(&y[idx_m])); + _mm512_storeu_ps(&y[idx_m], accum); + } + + if(tag_m_16x != m) { + __m512i idx_c0c3 = _mm512_set_epi32(29, 23, 17, 27, 21, 15, 9, 3, 26, 20, 30, 24, 18, 12, 6, 0); + __m512i idx_c1c4 = _mm512_add_epi32(idx_c0c3, M512_EPI32_1); + __m512i idx_c2c5 = _mm512_add_epi32(idx_c1c4, M512_EPI32_1); + idx_c2c5 = _mm512_mask_blend_epi32(0x0020, idx_c2c5, M512_EPI32_16); + __m256 c256_0, c256_1, c256_2, c256_3, c256_4, c256_5; + + __m256 x0_256 = _mm256_set1_ps(x[0]); + __m256 x1_256 = _mm256_set1_ps(x[1]); + __m256 x2_256 = _mm256_set1_ps(x[2]); + __m256 x3_256 = _mm256_set1_ps(x[3]); + __m256 x4_256 = _mm256_set1_ps(x[4]); + __m256 x5_256 = _mm256_set1_ps(x[5]); + __m256 alpha256 = _mm256_set1_ps(alpha); + __m256 accum_256; + + for(BLASLONG idx_m = tag_m_16x; idx_m \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \date November 2019 -* *> \ingroup complex_lin * * ===================================================================== PROGRAM CCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2017 * * ===================================================================== * @@ -156,9 +153,13 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL RWORK( 150*NMAX+2*MAXRHS ), S( 2*NMAX ) - COMPLEX A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), WORK( NMAX, NMAX+MAXRHS+10 ) + REAL S( 2*NMAX ) + COMPLEX E( NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -194,6 +195,17 @@ * .. Data statements .. DATA THREQ / 2.0 / , INTSTR / '0123456789' / * .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * S1 = SECOND( ) @@ -1196,6 +1208,11 @@ S2 = SECOND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/cchktsqr.f b/lapack-netlib/TESTING/LIN/cchktsqr.f index 8288916db..62b6ce434 100644 --- a/lapack-netlib/TESTING/LIN/cchktsqr.f +++ b/lapack-netlib/TESTING/LIN/cchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL CERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/dchkaa.f b/lapack-netlib/TESTING/LIN/dchkaa.F similarity index 96% rename from lapack-netlib/TESTING/LIN/dchkaa.f rename to lapack-netlib/TESTING/LIN/dchkaa.F index 03575c4d1..ef9d7808c 100644 --- a/lapack-netlib/TESTING/LIN/dchkaa.f +++ b/lapack-netlib/TESTING/LIN/dchkaa.F @@ -106,17 +106,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup double_lin * * ===================================================================== PROGRAM DCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- -* Novemebr 2019 * * ===================================================================== * @@ -152,9 +149,12 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), RWORK( 5*NMAX+2*MAXRHS ), - $ S( 2*NMAX ), WORK( NMAX, 3*NMAX+MAXRHS+30 ) + DOUBLE PRECISION E( NMAX ), S( 2*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -188,6 +188,18 @@ * .. Data statements .. DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / * .. +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* * .. Executable Statements .. * S1 = DSECND( ) @@ -677,7 +689,7 @@ * * SK: symmetric indefinite matrices, * with bounded Bunch-Kaufman (rook) pivoting algorithm, -* differnet matrix storage format than SR path version. +* different matrix storage format than SR path version. * NTYPES = 10 CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) @@ -1039,6 +1051,11 @@ S2 = DSECND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/dchktsqr.f b/lapack-netlib/TESTING/LIN/dchktsqr.f index c4b1f01bd..14119e6e5 100644 --- a/lapack-netlib/TESTING/LIN/dchktsqr.f +++ b/lapack-netlib/TESTING/LIN/dchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL DERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/schkaa.f b/lapack-netlib/TESTING/LIN/schkaa.F similarity index 97% rename from lapack-netlib/TESTING/LIN/schkaa.f rename to lapack-netlib/TESTING/LIN/schkaa.F index a9c13e442..a5b826d06 100644 --- a/lapack-netlib/TESTING/LIN/schkaa.f +++ b/lapack-netlib/TESTING/LIN/schkaa.F @@ -104,17 +104,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup single_lin * * ===================================================================== PROGRAM SCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * ===================================================================== * @@ -150,9 +147,12 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), RWORK( 5*NMAX+2*MAXRHS ), - $ S( 2*NMAX ), WORK( NMAX, NMAX+MAXRHS+30 ) + REAL E( NMAX ), S( 2*NMAX ) +* .. 
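The xchkaa changes here move the biggest test-matrix workspaces from fixed-size local arrays, which live on the stack and can overflow it at startup, to ALLOCATE'd heap storage with an explicit failure check, so the testsuite no longer needs an enlarged stack limit. The C analogue of the transformation, as an illustrative sketch with hypothetical sizes:

#include <stdio.h>
#include <stdlib.h>

enum { NMAX = 132, KDMAX = 33 };   /* hypothetical values for illustration */

int run_tests(void)
{
    /* before: double a[(KDMAX+1)*NMAX*7]; -- a multi-megabyte automatic
       array allocated on the stack */

    /* after: heap allocation, mirroring ALLOCATE(A(...), STAT = AllocateStatus) */
    double *a = malloc(sizeof(double) * (KDMAX + 1) * NMAX * 7);
    if (a == NULL) {
        fputs("*** Not enough memory ***\n", stderr);
        return 1;
    }
    /* ... run the tests ... */
    free(a);
    return 0;
}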
+* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -186,6 +186,17 @@ * .. Data statements .. DATA THREQ / 2.0E0 / , INTSTR / '0123456789' / * .. +* .. Allocate memory dynamically .. +* + ALLOCATE (A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (WORK( NMAX, NMAX+MAXRHS+30 ) , STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * S1 = SECOND( ) @@ -1034,6 +1045,11 @@ S2 = SECOND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/schktsqr.f b/lapack-netlib/TESTING/LIN/schktsqr.f index 2bed434a8..aa4d6f9c4 100644 --- a/lapack-netlib/TESTING/LIN/schktsqr.f +++ b/lapack-netlib/TESTING/LIN/schktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL SERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/zchkaa.f b/lapack-netlib/TESTING/LIN/zchkaa.F similarity index 97% rename from lapack-netlib/TESTING/LIN/zchkaa.f rename to lapack-netlib/TESTING/LIN/zchkaa.F index 30d2a084a..a118515a5 100644 --- a/lapack-netlib/TESTING/LIN/zchkaa.f +++ b/lapack-netlib/TESTING/LIN/zchkaa.F @@ -110,17 +110,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== PROGRAM ZCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * ===================================================================== * @@ -156,9 +153,13 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION RWORK( 150*NMAX+2*MAXRHS ), S( 2*NMAX ) - COMPLEX*16 A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), WORK( NMAX, NMAX+MAXRHS+10 ) + DOUBLE PRECISION S( 2*NMAX ) + COMPLEX*16 E( NMAX ) +* +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE:: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -194,6 +195,16 @@ * .. * .. Data statements .. DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / +* +* .. Allocate memory dynamically .. 
+ ALLOCATE (RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (A ((KDMAX+1) * NMAX, 7), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (B (NMAX * MAXRHS, 4), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (WORK (NMAX, NMAX+MAXRHS+10), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. * @@ -1231,6 +1242,11 @@ S2 = DSECND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/zchktsqr.f b/lapack-netlib/TESTING/LIN/zchktsqr.f index e6e6ac556..678b1772f 100644 --- a/lapack-netlib/TESTING/LIN/zchktsqr.f +++ b/lapack-netlib/TESTING/LIN/zchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL ZERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/openblas_config_template.h b/openblas_config_template.h index 858b8c5cb..1e17c9a16 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -99,5 +99,6 @@ typedef int blasint; /* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */ #ifdef OPENBLAS_OS_LINUX +#define _GNU_SOURCE #include #endif diff --git a/param.h b/param.h index a35ce69bd..01048023f 100644 --- a/param.h +++ b/param.h @@ -72,13 +72,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H -#define LONGCAST (BLASLONG) -#if defined(__BYTE_ORDER__) -#if __GNUC__ < 9 -#undef LONGCAST -#define LONGCAST -#endif -#endif #define SBGEMM_DEFAULT_UNROLL_N 4 #define SBGEMM_DEFAULT_UNROLL_M 8 @@ -2096,7 +2089,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef PPCG4 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2127,7 +2120,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 2688 #define GEMM_DEFAULT_OFFSET_B 3072 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL +#define GEMM_DEFAULT_ALIGN 0x03fffUL #if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2176,7 +2169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2212,7 +2205,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2247,7 +2240,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
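The openblas_config_template.h hunk defines _GNU_SOURCE immediately before the Linux-only include (the header in question is <sched.h>, which declares cpu_set_t); both glibc and musl expose cpu_set_t only when _GNU_SOURCE is defined before the header is read. A minimal sketch of the dependency, assuming a Linux host:

#define _GNU_SOURCE   /* must precede the include, as in the patch */
#include <sched.h>
#include <stdio.h>

int main(void) {
    cpu_set_t set;    /* only declared when _GNU_SOURCE is in effect */
    CPU_ZERO(&set);
    if (sched_getaffinity(0, sizeof(set), &set) == 0)
        printf("runnable on %d cpus\n", CPU_COUNT(&set));
    return 0;
}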
#if defined(POWER3) || defined(POWER4) || defined(POWER5) #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2271,6 +2264,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_Q 216 #define DGEMM_DEFAULT_R 1012 +#define CGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_Q 104 +#define CGEMM_DEFAULT_R 1012 + #define ZGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_Q 104 #define ZGEMM_DEFAULT_R 1012 @@ -2288,6 +2285,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 144 #define ZGEMM_DEFAULT_P 144 #endif + +#define SGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 #endif #if defined(POWER5) @@ -2320,7 +2322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL +#define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2353,7 +2355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #if defined(__32BIT__) #warning using BINARY32==POWER6 #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2406,7 +2408,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SWITCH_RATIO 16 #define GEMM_PREFERED_SIZE 16 @@ -2445,7 +2447,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
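Throughout param.h this patch drops the LONGCAST prefix from GEMM_DEFAULT_ALIGN; the 0x0ffffUL / 0x03fffUL values are already unsigned-long literals, and they are consumed as address masks when scratch-buffer pointers are rounded up to the next 64 KiB or 16 KiB boundary. The usual rounding idiom looks roughly like this (an illustrative sketch, not OpenBLAS's exact allocator code):

#include <stdint.h>

/* round p up to the next (mask + 1)-byte boundary;
   mask = 0xffffUL gives 64 KiB alignment, 0x3fffUL gives 16 KiB */
static inline void *align_up(void *p, uintptr_t mask)
{
    return (void *)(((uintptr_t)p + mask) & ~mask);
}

The aligned base is then what displacements such as GEMM_DEFAULT_OFFSET_A/B are applied to.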
 #define GEMM_DEFAULT_OFFSET_A 0
 #define GEMM_DEFAULT_OFFSET_B 65536
-#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
 
 #define SWITCH_RATIO 16
 #define GEMM_PREFERED_SIZE 16
@@ -2957,7 +2959,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 2048
 
-#elif defined(CORTEXA53)
+#elif defined(CORTEXA53) || defined(CORTEXA55)
 
 #define SGEMM_DEFAULT_UNROLL_M 8
 #define SGEMM_DEFAULT_UNROLL_N 8
diff --git a/relapack/src/lapack_wrappers.c.orig b/relapack/src/lapack_wrappers.c.orig
deleted file mode 100644
index d89d2fe2f..000000000
--- a/relapack/src/lapack_wrappers.c.orig
+++ /dev/null
@@ -1,607 +0,0 @@
-#include "relapack.h"
-
-////////////
-// XLAUUM //
-////////////
-
-#if INCLUDE_SLAUUM
-void LAPACK(slauum)(
-    const char *uplo, const int *n,
-    float *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_slauum(uplo, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_DLAUUM
-void LAPACK(dlauum)(
-    const char *uplo, const int *n,
-    double *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_dlauum(uplo, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_CLAUUM
-void LAPACK(clauum)(
-    const char *uplo, const int *n,
-    float *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_clauum(uplo, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_ZLAUUM
-void LAPACK(zlauum)(
-    const char *uplo, const int *n,
-    double *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_zlauum(uplo, n, A, ldA, info);
-}
-#endif
-
-
-////////////
-// XSYGST //
-////////////
-
-#if INCLUDE_SSYGST
-void LAPACK(ssygst)(
-    const int *itype, const char *uplo, const int *n,
-    float *A, const int *ldA, const float *B, const int *ldB,
-    int *info
-) {
-    RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info);
-}
-#endif
-
-#if INCLUDE_DSYGST
-void LAPACK(dsygst)(
-    const int *itype, const char *uplo, const int *n,
-    double *A, const int *ldA, const double *B, const int *ldB,
-    int *info
-) {
-    RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info);
-}
-#endif
-
-#if INCLUDE_CSYGST
-void LAPACK(csygst)(
-    const int *itype, const char *uplo, const int *n,
-    float *A, const int *ldA, const float *B, const int *ldB,
-    int *info
-) {
-    RELAPACK_csygst(itype, uplo, n, A, ldA, B, ldB, info);
-}
-#endif
-
-#if INCLUDE_ZSYGST
-void LAPACK(zsygst)(
-    const int *itype, const char *uplo, const int *n,
-    double *A, const int *ldA, const double *B, const int *ldB,
-    int *info
-) {
-    RELAPACK_zsygst(itype, uplo, n, A, ldA, B, ldB, info);
-}
-#endif
-
-
-////////////
-// XTRTRI //
-////////////
-
-#if INCLUDE_STRTRI
-void LAPACK(strtri)(
-    const char *uplo, const char *diag, const int *n,
-    float *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_strtri(uplo, diag, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_DTRTRI
-void LAPACK(dtrtri)(
-    const char *uplo, const char *diag, const int *n,
-    double *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_dtrtri(uplo, diag, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_CTRTRI
-void LAPACK(ctrtri)(
-    const char *uplo, const char *diag, const int *n,
-    float *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_ctrtri(uplo, diag, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_ZTRTRI
-void LAPACK(ztrtri)(
-    const char *uplo, const char *diag, const int *n,
-    double *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_ztrtri(uplo, diag, n, A, ldA, info);
-}
-#endif
-
-
-////////////
-// XPOTRF //
-////////////
-
-#if INCLUDE_SPOTRF
-void LAPACK(spotrf)(
-    const char *uplo, const int *n,
-    float *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_spotrf(uplo, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_DPOTRF
-void LAPACK(dpotrf)(
-    const char *uplo, const int *n,
-    double *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_dpotrf(uplo, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_CPOTRF
-void LAPACK(cpotrf)(
-    const char *uplo, const int *n,
-    float *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_cpotrf(uplo, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_ZPOTRF
-void LAPACK(zpotrf)(
-    const char *uplo, const int *n,
-    double *A, const int *ldA,
-    int *info
-) {
-    RELAPACK_zpotrf(uplo, n, A, ldA, info);
-}
-#endif
-
-
-////////////
-// XPBTRF //
-////////////
-
-#if INCLUDE_SPBTRF
-void LAPACK(spbtrf)(
-    const char *uplo, const int *n, const int *kd,
-    float *Ab, const int *ldAb,
-    int *info
-) {
-    RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info);
-}
-#endif
-
-#if INCLUDE_DPBTRF
-void LAPACK(dpbtrf)(
-    const char *uplo, const int *n, const int *kd,
-    double *Ab, const int *ldAb,
-    int *info
-) {
-    RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info);
-}
-#endif
-
-#if INCLUDE_CPBTRF
-void LAPACK(cpbtrf)(
-    const char *uplo, const int *n, const int *kd,
-    float *Ab, const int *ldAb,
-    int *info
-) {
-    RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info);
-}
-#endif
-
-#if INCLUDE_ZPBTRF
-void LAPACK(zpbtrf)(
-    const char *uplo, const int *n, const int *kd,
-    double *Ab, const int *ldAb,
-    int *info
-) {
-    RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info);
-}
-#endif
-
-
-////////////
-// XSYTRF //
-////////////
-
-#if INCLUDE_SSYTRF
-void LAPACK(ssytrf)(
-    const char *uplo, const int *n,
-    float *A, const int *ldA, int *ipiv,
-    float *Work, const int *lWork, int *info
-) {
-    RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-#if INCLUDE_DSYTRF
-void LAPACK(dsytrf)(
-    const char *uplo, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *lWork, int *info
-) {
-    RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-#if INCLUDE_CSYTRF
-void LAPACK(csytrf)(
-    const char *uplo, const int *n,
-    float *A, const int *ldA, int *ipiv,
-    float *Work, const int *lWork, int *info
-) {
-    RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-#if INCLUDE_ZSYTRF
-void LAPACK(zsytrf)(
-    const char *uplo, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *lWork, int *info
-) {
-    RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-#if INCLUDE_CHETRF
-void LAPACK(chetrf)(
-    const char *uplo, const int *n,
-    float *A, const int *ldA, int *ipiv,
-    float *Work, const int *lWork, int *info
-) {
-    RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-#if INCLUDE_ZHETRF
-void LAPACK(zhetrf)(
-    const char *uplo, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *lWork, int *info
-) {
-    RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-#if INCLUDE_SSYTRF_ROOK
-void LAPACK(ssytrf_rook)(
-    const char *uplo, const int *n,
-    float *A, const int *ldA, int *ipiv,
-    float *Work, const int *lWork, int *info
-) {
-    RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-#if INCLUDE_DSYTRF_ROOK
-void LAPACK(dsytrf_rook)(
-    const char *uplo, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *lWork, int *info
-) {
-    RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-#if INCLUDE_CSYTRF_ROOK
-void LAPACK(csytrf_rook)(
-    const char *uplo, const int *n,
-    float *A, const int *ldA, int *ipiv,
-    float *Work, const int *lWork, int *info
-) {
-    RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-#if INCLUDE_ZSYTRF_ROOK
-void LAPACK(zsytrf_rook)(
-    const char *uplo, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *lWork, int *info
-) {
-    RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-#if INCLUDE_CHETRF_ROOK
-void LAPACK(chetrf_rook)(
-    const char *uplo, const int *n,
-    float *A, const int *ldA, int *ipiv,
-    float *Work, const int *lWork, int *info
-) {
-    RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-#if INCLUDE_ZHETRF_ROOK
-void LAPACK(zhetrf_rook)(
-    const char *uplo, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *lWork, int *info
-) {
-    RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
-}
-#endif
-
-
-////////////
-// XGETRF //
-////////////
-
-#if INCLUDE_SGETRF
-void LAPACK(sgetrf)(
-    const int *m, const int *n,
-    float *A, const int *ldA, int *ipiv,
-    int *info
-) {
-    RELAPACK_sgetrf(m, n, A, ldA, ipiv, info);
-}
-#endif
-
-#if INCLUDE_DGETRF
-void LAPACK(dgetrf)(
-    const int *m, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    int *info
-) {
-    RELAPACK_dgetrf(m, n, A, ldA, ipiv, info);
-}
-#endif
-
-#if INCLUDE_CGETRF
-void LAPACK(cgetrf)(
-    const int *m, const int *n,
-    float *A, const int *ldA, int *ipiv,
-    int *info
-) {
-    RELAPACK_cgetrf(m, n, A, ldA, ipiv, info);
-}
-#endif
-
-#if INCLUDE_ZGETRF
-void LAPACK(zgetrf)(
-    const int *m, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    int *info
-) {
-    RELAPACK_zgetrf(m, n, A, ldA, ipiv, info);
-}
-#endif
-
-
-////////////
-// XGBTRF //
-////////////
-
-#if INCLUDE_SGBTRF
-void LAPACK(sgbtrf)(
-    const int *m, const int *n, const int *kl, const int *ku,
-    float *Ab, const int *ldAb, int *ipiv,
-    int *info
-) {
-    RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
-}
-#endif
-
-#if INCLUDE_DGBTRF
-void LAPACK(dgbtrf)(
-    const int *m, const int *n, const int *kl, const int *ku,
-    double *Ab, const int *ldAb, int *ipiv,
-    int *info
-) {
-    RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
-}
-#endif
-
-#if INCLUDE_CGBTRF
-void LAPACK(cgbtrf)(
-    const int *m, const int *n, const int *kl, const int *ku,
-    float *Ab, const int *ldAb, int *ipiv,
-    int *info
-) {
-    RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
-}
-#endif
-
-#if INCLUDE_ZGBTRF
-void LAPACK(zgbtrf)(
-    const int *m, const int *n, const int *kl, const int *ku,
-    double *Ab, const int *ldAb, int *ipiv,
-    int *info
-) {
-    RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
-}
-#endif
-
-
-////////////
-// XTRSYL //
-////////////
-
-#if INCLUDE_STRSYL
-void LAPACK(strsyl)(
-    const char *tranA, const char *tranB, const int *isgn,
-    const int *m, const int *n,
-    const float *A, const int *ldA, const float *B, const int *ldB,
-    float *C, const int *ldC, float *scale,
-    int *info
-) {
-    RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
-}
-#endif
-
-#if INCLUDE_DTRSYL
-void LAPACK(dtrsyl)(
-    const char *tranA, const char *tranB, const int *isgn,
-    const int *m, const int *n,
-    const double *A, const int *ldA, const double *B, const int *ldB,
-    double *C, const int *ldC, double *scale,
-    int *info
-) {
-    RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
-}
-#endif
-
-#if INCLUDE_CTRSYL
-void LAPACK(ctrsyl)(
-    const char *tranA, const char *tranB, const int *isgn,
-    const int *m, const int *n,
-    const float *A, const int *ldA, const float *B, const int *ldB,
-    float *C, const int *ldC, float *scale,
-    int *info
-) {
-    RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
-}
-#endif
-
-#if INCLUDE_ZTRSYL
-void LAPACK(ztrsyl)(
-    const char *tranA, const char *tranB, const int *isgn,
-    const int *m, const int *n,
-    const double *A, const int *ldA, const double *B, const int *ldB,
-    double *C, const int *ldC, double *scale,
-    int *info
-) {
-    RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
-}
-#endif
-
-
-////////////
-// XTGSYL //
-////////////
-
-#if INCLUDE_STGSYL
-void LAPACK(stgsyl)(
-    const char *trans, const int *ijob, const int *m, const int *n,
-    const float *A, const int *ldA, const float *B, const int *ldB,
-    float *C, const int *ldC,
-    const float *D, const int *ldD, const float *E, const int *ldE,
-    float *F, const int *ldF,
-    float *scale, float *dif,
-    float *Work, const int *lWork, int *iWork, int *info
-) {
-    RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
-}
-#endif
-
-#if INCLUDE_DTGSYL
-void LAPACK(dtgsyl)(
-    const char *trans, const int *ijob, const int *m, const int *n,
-    const double *A, const int *ldA, const double *B, const int *ldB,
-    double *C, const int *ldC,
-    const double *D, const int *ldD, const double *E, const int *ldE,
-    double *F, const int *ldF,
-    double *scale, double *dif,
-    double *Work, const int *lWork, int *iWork, int *info
-) {
-    RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
-}
-#endif
-
-#if INCLUDE_CTGSYL
-void LAPACK(ctgsyl)(
-    const char *trans, const int *ijob, const int *m, const int *n,
-    const float *A, const int *ldA, const float *B, const int *ldB,
-    float *C, const int *ldC,
-    const float *D, const int *ldD, const float *E, const int *ldE,
-    float *F, const int *ldF,
-    float *scale, float *dif,
-    float *Work, const int *lWork, int *iWork, int *info
-) {
-    RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
-}
-#endif
-
-#if INCLUDE_ZTGSYL
-void LAPACK(ztgsyl)(
-    const char *trans, const int *ijob, const int *m, const int *n,
-    const double *A, const int *ldA, const double *B, const int *ldB,
-    double *C, const int *ldC,
-    const double *D, const int *ldD, const double *E, const int *ldE,
-    double *F, const int *ldF,
-    double *scale, double *dif,
-    double *Work, const int *lWork, int *iWork, int *info
-) {
-    RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
-}
-#endif
-
-
-////////////
-// XGEMMT //
-////////////
-
-#if INCLUDE_SGEMMT
-void LAPACK(sgemmt)(
-    const char *uplo, const char *transA, const char *transB,
-    const int *n, const int *k,
-    const float *alpha, const float *A, const int *ldA,
-    const float *B, const int *ldB,
-    const float *beta, float *C, const int *ldC
-) {
-    RELAPACK_sgemmt(uplo, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_DGEMMT
-void LAPACK(dgemmt)(
-    const char *uplo, const char *transA, const char *transB,
-    const int *n, const int *k,
-    const double *alpha, const double *A, const int *ldA,
-    const double *B, const int *ldB,
-    const double *beta, double *C, const int *ldC
-) {
-    RELAPACK_dgemmt(uplo, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_CGEMMT
-void LAPACK(cgemmt)(
-    const char *uplo, const char *transA, const char *transB,
-    const int *n, const int *k,
-    const float *alpha, const float *A, const int *ldA,
-    const float *B, const int *ldB,
-    const float *beta, float *C, const int *ldC
-) {
-    RELAPACK_cgemmt(uplo, n, A, ldA, info);
-}
-#endif
-
-#if INCLUDE_ZGEMMT
-void LAPACK(zgemmt)(
-    const char *uplo, const char *transA, const char *transB,
-    const int *n, const int *k,
-    const double *alpha, const double *A, const int *ldA,
-    const double *B, const int *ldB,
-    const double *beta, double *C, const int *ldC
-) {
-    RELAPACK_zgemmt(uplo, n, A, ldA, info);
-}
-#endif
diff --git a/test/Makefile b/test/Makefile
index 54fa60533..6c5f041c2 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -259,10 +259,6 @@ endif
 
 FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
 
-ifeq ($(CORE), C910V)
-EXTRALIB =
-CEXTRALIB =
-endif
 
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(F_COMPILER), GFORTRAN)
diff --git a/utest/ctest.h b/utest/ctest.h
index d316b1494..037f7f28d 100644
--- a/utest/ctest.h
+++ b/utest/ctest.h
@@ -28,7 +28,10 @@
 #define WEAK
 #endif
 
+#ifndef _MSC_VER
 #include <inttypes.h> /* intmax_t, uintmax_t, PRI* */
+#endif
+
 #include <stddef.h> /* size_t */
 
 typedef void (*SetupFunc)(void*);
@@ -72,6 +75,13 @@ struct ctest {
 #define __CTEST_NO_TIME
 #define CTEST_NO_COLORS
 
+#if _MSC_VER >= 1500
+#include <inttypes.h>
+#else
+#include <stdint.h>
+#define CTEST_NO_INTTYPES
+#endif
+
 #ifndef CTEST_ADD_TESTS_MANUALLY
 #pragma section(".ctest$a")
 #pragma section(".ctest$u")
@@ -480,11 +490,19 @@ void assert_data(const unsigned char* exp, size_t expsize,
                  const char* caller, int line) {
     size_t i;
     if (expsize != realsize) {
+#ifndef CTEST_NO_INTTYPES
         CTEST_ERR("%s:%d expected %" PRIuMAX " bytes, got %" PRIuMAX, caller, line, (uintmax_t) expsize, (uintmax_t) realsize);
+#else
+        CTEST_ERR("%s:%d expected %u bytes, got %u", caller, line, (unsigned int) expsize, (unsigned int) realsize);
+#endif
     }
     for (i=0; i<expsize; i++) {
[...]
     if (real < exp1 || real > exp2) {
+#ifndef CTEST_NO_INTTYPES
         CTEST_ERR("%s:%d expected %" PRIdMAX "-%" PRIdMAX ", got %" PRIdMAX, caller, line, exp1, exp2, real);
+#else
+        CTEST_ERR("%s:%d expected %d-%d, got %d", caller, line, (int) exp1, (int) exp2, (int) real);
+#endif
     }
 }
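
Note on the utest/ctest.h hunks above: they use a common C portability pattern,
prefer the PRI* format macros from <inttypes.h> where the header exists, and
otherwise fall back to plain conversion specifiers with explicit casts so the
arguments still match the format string. A minimal standalone sketch of that
pattern follows; the version cutoff and file layout are illustrative
assumptions, and only the CTEST_NO_INTTYPES macro name is taken from the patch.

#include <stddef.h>   /* size_t */
#include <stdio.h>

#if defined(_MSC_VER) && _MSC_VER < 1500
/* assumed cutoff, mirroring the patch: old MSVC has no usable <inttypes.h> */
#define CTEST_NO_INTTYPES
#include <stdint.h>
#else
#include <inttypes.h> /* intmax_t, uintmax_t, PRIuMAX, PRIdMAX, ... */
#endif

int main(void) {
    size_t nbytes = 42;
#ifndef CTEST_NO_INTTYPES
    /* width-safe path: widen to uintmax_t and print with the PRIuMAX macro */
    printf("got %" PRIuMAX " bytes\n", (uintmax_t) nbytes);
#else
    /* fallback path: cast so the argument type matches the %u specifier */
    printf("got %u bytes\n", (unsigned int) nbytes);
#endif
    return 0;
}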