diff --git a/.drone.yml b/.drone.yml index b1c211d14..38ded2015 100644 --- a/.drone.yml +++ b/.drone.yml @@ -190,3 +190,27 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester +--- +kind: pipeline +name: arm64_gcc10 + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:20.04 + environment: + CC: gcc-10 + FC: gfortran-10 + COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran-10 perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C test $COMMON_FLAGS + diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index 8d7cfea2d..b025f8634 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -44,6 +44,11 @@ jobs: if: github.event_name != 'pull_request' run: brew update || true + - name: unlink installed gcc to allow updating + run: | + brew unlink gcc@8 + brew unlink gcc@9 + - name: Install prerequisites run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas diff --git a/.gitignore b/.gitignore index bca79f043..0fe20ecaa 100644 --- a/.gitignore +++ b/.gitignore @@ -89,5 +89,7 @@ build.* *.swp benchmark/*.goto benchmark/smallscaling +.vscode CMakeCache.txt CMakeFiles/* +.vscode diff --git a/CMakeLists.txt b/CMakeLists.txt index 12730e0e3..3107ef9a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 13) +set(OpenBLAS_PATCH_VERSION 14) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -14,6 +14,9 @@ include(GNUInstallDirs) 
include(CMakePackageConfigHelpers) +if(MSVC AND NOT DEFINED NOFORTRAN) + set(NOFORTRAN ON) +endif() ####### if(MSVC) @@ -229,7 +232,7 @@ if (NOT NO_CBLAS) add_subdirectory(utest) endif() -if (NOT MSVC AND NOT NOFORTRAN) +if (NOT NOFORTRAN) # Build test and ctest add_subdirectory(test) if(NOT NO_CBLAS) diff --git a/Changelog.txt b/Changelog.txt index cbc7007ac..5662bc5c6 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,52 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.14 + 17-Mar-2021 + + common: + * Fixed a race condition on thread shutdown in non-OpenMP builds + * Fixed custom BUFFERSIZE option getting ignored in gmake builds + * Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms + * Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT + * Improved performance of OMATCOPY_RT across all platforms + * Changed perl scripts to use env instead of a hardcoded /usr/bin/perl + * Fixed potential misreading of the GCC compiler version in the build scripts + * Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477) + * Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335) + + RISCV: + * Fixed compilation on RISCV (missing entry in getarch) + + POWER: + * Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions + * Added support for compilation on FreeBSD/ppc64le + * Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL + * Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM + * Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10 + * Improved SCOPY and CCOPY performance on POWER10 + * Improved SGEMM and DGEMM performance on POWER10 + * Added support for compilation with the NVIDIA HPC compiler + + x86_64: + * Added an optimized bfloat16 GEMM kernel for Cooperlake + * Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus + * Improved the performance of SASUM,DASUM,SROT,DROT 
on AMD Ryzen cpus + * Added support for compilation with the NAG Fortran compiler + * Fixed recognition of the AMD AOCC compiler + * Fixed compilation for DYNAMIC_ARCH with clang on Windows + * Added support for running the BLAS/CBLAS tests on Windows + * Fixed signatures of the tls callback functions for Windows x64 + * Fixed various issues with fma intrinsics support handling + + ARM: + * Added support for embedded Cortex M targets via a new option EMBEDDED + + ARMV8: + * Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf + * Added support for the DYNAMIC_LIST option + * Added support for compilation with the NVIDIA HPC compiler + * Added support for compiling with the NAG Fortran compiler + ==================================================================== Version 0.3.13 12-Dec-2020 diff --git a/Makefile b/Makefile index 54dd3be41..de0735c4a 100644 --- a/Makefile +++ b/Makefile @@ -59,6 +59,9 @@ endif @$(CC) --version > /dev/null 2>&1;\ if [ $$? -eq 0 ]; then \ cverinfo=`$(CC) --version | sed -n '1p'`; \ + if [ -z "$${cverinfo}" ]; then \ + cverinfo=`$(CC) --version | sed -n '2p'`; \ + fi; \ echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ else \ echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ @@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @$(FC) --version > /dev/null 2>&1;\ if [ $$? -eq 0 ]; then \ fverinfo=`$(FC) --version | sed -n '1p'`; \ + if [ -z "$${fverinfo}" ]; then \ + fverinfo=`$(FC) --version | sed -n '2p'`; \ + fi; \ echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ else \ echo " Fortran compiler ... 
$(F_COMPILER) (command line : $(FC))";\ diff --git a/Makefile.arm64 b/Makefile.arm64 index 62a877fff..23362b4e5 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,28 +1,38 @@ - +ifneq ($(C_COMPILER), PGI) ifeq ($(CORE), ARMV8) CCOMMON_OPT += -march=armv8-a +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a endif +endif ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 endif +endif ifeq ($(CORE), CORTEXA57) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 endif +endif ifeq ($(CORE), CORTEXA72) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif +endif ifeq ($(CORE), CORTEXA73) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 endif +endif # Use a72 tunings because Neoverse-N1 is only available # in GCC>=9 @@ -30,51 +40,71 @@ ifeq ($(CORE), NEOVERSEN1) ifeq ($(GCCVERSIONGTEQ7), 1) ifeq ($(GCCVERSIONGTEQ9), 1) CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +endif else CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif +endif else CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif endif +endif ifeq ($(CORE), THUNDERX) CCOMMON_OPT += -march=armv8-a -mtune=thunderx +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=thunderx endif +endif ifeq ($(CORE), FALKOR) CCOMMON_OPT += -march=armv8-a -mtune=falkor +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=falkor endif +endif ifeq ($(CORE), THUNDERX2T99) CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +ifneq 
($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif +endif ifeq ($(CORE), THUNDERX3T110) ifeq ($(GCCVERSIONGTEQ10), 1) CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +endif else CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif endif +endif ifeq ($(CORE), VORTEX) CCOMMON_OPT += -march=armv8.3-a +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.3-a endif +endif ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 endif endif - +endif +endif diff --git a/Makefile.power b/Makefile.power index c7e972290..946f55232 100644 --- a/Makefile.power +++ b/Makefile.power @@ -10,9 +10,11 @@ USE_OPENMP = 1 endif ifeq ($(CORE), POWER10) +ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif +endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) diff --git a/Makefile.rule b/Makefile.rule index e4b82104e..5a46bf6b0 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.13 +VERSION = 0.3.14 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library diff --git a/Makefile.system b/Makefile.system index 5adde36d8..ae703e4d9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64) override ARCH=x86_64 else ifeq ($(ARCH), powerpc64) override ARCH=power +else ifeq ($(ARCH), powerpc64le) +override ARCH=power else ifeq ($(ARCH), powerpc) override ARCH=power else ifeq ($(ARCH), i386) @@ -181,7 +183,7 @@ endif # On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. ifeq ($(HOSTARCH), x86_64) -ifeq ($(findstring pgcc,$(HOSTCC)),) +ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),) GETARCH_FLAGS += -march=native endif endif @@ -623,6 +625,11 @@ DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 DYNAMIC_CORE += EMAG8180 DYNAMIC_CORE += THUNDERX3T110 +ifdef DYNAMIC_LIST +override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST) +XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8 +XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +endif endif ifeq ($(ARCH), mips64) @@ -663,6 +670,7 @@ endif endif # ARCH zarch ifeq ($(ARCH), power) +ifneq ($(C_COMPILER), PGI) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) @@ -689,6 +697,10 @@ else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) endif endif +else +DYNAMIC_CORE = POWER8 +DYNAMIC_CORE += POWER9 +endif endif # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty @@ -847,9 +859,19 @@ endif endif ifeq ($(C_COMPILER), PGI) +PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) +PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." 
-f 1` \>= 20) +PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11) +PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONGTEQ20)$(PGCMINORVERSIONGE11) +ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011)) +NEWPGI := 1 +endif ifdef BINARY64 ifeq ($(ARCH), x86_64) -CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm +CCOMMON_OPT += -tp p7-64 +ifneq ($(NEWPGI),1) +CCOMMON_OPT += -D__MMX__ -Mnollvm +endif else ifeq ($(ARCH), power) ifeq ($(CORE), POWER8) @@ -877,13 +899,25 @@ endif # Fortran Compiler dependent settings # +ifeq ($(F_COMPILER), NAG) +FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe +ifdef INTERFACE64 +ifneq ($(INTERFACE64), 0) +FCOMMON_OPT += -i8 +endif +endif +ifeq ($(USE_OPENMP), 1) +FCOMMON_OPT += -openmp +endif +endif + ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(OSNAME), Linux) ifeq ($(ARCH), x86_64) -FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`) -ifeq ($(FLANG_VENDOR),AOCC) +FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ") +ifeq ($(FLANG_VENDOR), AMD) FCOMMON_OPT += -fno-unroll-loops endif endif @@ -1029,18 +1063,24 @@ ifeq ($(ARCH), x86_64) FCOMMON_OPT += -tp p7-64 else ifeq ($(ARCH), power) +ifeq ($(CORE), POWER6) +$(warning NVIDIA HPC compilers do not support POWER6.) +endif ifeq ($(CORE), POWER8) FCOMMON_OPT += -tp pwr8 endif ifeq ($(CORE), POWER9) FCOMMON_OPT += -tp pwr9 endif +ifeq ($(CORE), POWER10) +$(warning NVIDIA HPC compilers do not support POWER10.) 
+endif endif endif else FCOMMON_OPT += -tp p7 endif -FCOMMON_OPT += -Mrecursive +FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif @@ -1179,6 +1219,8 @@ CCOMMON_OPT += -fPIC endif ifeq ($(F_COMPILER), SUN) FCOMMON_OPT += -pic +else ifeq ($(F_COMPILER), NAG) +FCOMMON_OPT += -PIC else FCOMMON_OPT += -fPIC endif @@ -1256,6 +1298,10 @@ CCOMMON_OPT += -DUSE_PAPI EXTRALIB += -lpapi -lperfctr endif +ifdef BUFFERSIZE +CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) +endif + ifdef DYNAMIC_THREADS CCOMMON_OPT += -DDYNAMIC_THREADS endif @@ -1433,6 +1479,10 @@ LAPACK_FFLAGS := $(FFLAGS) LAPACK_FPFLAGS := $(FPFLAGS) endif +ifeq ($(F_COMPILER),NAG) +LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +endif + LAPACK_CFLAGS = $(CFLAGS) LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H ifdef INTERFACE64 diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 00967bcb6..5406494c9 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -10,40 +10,46 @@ endif ifdef HAVE_SSE3 CCOMMON_OPT += -msse3 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -msse3 endif +endif ifdef HAVE_SSSE3 CCOMMON_OPT += -mssse3 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mssse3 endif +endif ifdef HAVE_SSE4_1 CCOMMON_OPT += -msse4.1 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -msse4.1 endif +endif ifndef OLDGCC ifdef HAVE_AVX CCOMMON_OPT += -mavx +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mavx endif endif +endif ifndef NO_AVX2 ifdef HAVE_AVX2 CCOMMON_OPT += -mavx2 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mavx2 endif endif -ifndef OLDGCC -ifdef HAVE_FMA3 -CCOMMON_OPT += -mfma -FCOMMON_OPT += -mfma -endif endif ifeq ($(CORE), SKYLAKEX) ifndef DYNAMIC_ARCH ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=skylake-avx512 +endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables FCOMMON_OPT += -fno-asynchronous-unwind-tables @@ -65,9 +71,11 @@ ifeq ($(C_COMPILER), GCC) # cooperlake 
support was added in 10.1 ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) CCOMMON_OPT += -march=cooperlake +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=cooperlake endif endif +endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables FCOMMON_OPT += -fno-asynchronous-unwind-tables diff --git a/README.md b/README.md index 267df5358..174f951f4 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,14 @@ Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/sta ## Introduction -OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. +OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. Please read the documentation on the OpenBLAS wiki pages: . +For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: +. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six +20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare or Youtube may be helpful. + ## Binary Packages We provide official binary packages for the following platform: @@ -208,7 +212,8 @@ Please note that it is not possible to combine support for different architectur - **Android**: Supported by the community. Please read . - **AIX**: Supported on PPC up to POWER8 - **Haiku**: Supported by the community. We don't actively test the library on this OS. -- **SunOS**: Supported by the community. We don't actively test the library on this OS: +- **SunOS**: Supported by the community. We don't actively test the library on this OS. +- **Cortex-M**: Supported by the community. Please read . 
## Usage diff --git a/appveyor.yml b/appveyor.yml index 1936059d5..c9b2fa3a1 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -30,10 +30,10 @@ environment: CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 matrix: - COMPILER: clang-cl - WITH_FORTRAN: yes + WITH_FORTRAN: ON - COMPILER: clang-cl DYNAMIC_ARCH: ON - WITH_FORTRAN: no + WITH_FORTRAN: OFF - COMPILER: cl - COMPILER: MinGW64-gcc-7.2.0-mingw DYNAMIC_ARCH: OFF @@ -47,12 +47,7 @@ environment: install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force - - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake - - - if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja - - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja - - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang - + - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" @@ -68,15 +63,14 @@ before_build: - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. 
+ - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. + - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. + - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. build_script: - cmake --build . test_script: - - echo Running Test - - cd utest - - openblas_utest + - ctest -j2 diff --git a/benchmark/bench.h b/benchmark/bench.h index 1f9b8986c..83de8ab2b 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -74,6 +74,9 @@ static void *huge_malloc(BLASLONG size){ #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; +#elif defined(__APPLE__) + mach_timebase_info_data_t info; + uint64_t start = 0, stop = 0; #else struct timespec start = { 0, 0 }, stop = { 0, 0 }; #endif @@ -82,6 +85,9 @@ double getsec() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; +#elif defined(__APPLE__) + mach_timebase_info(&info); + return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9; #else return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; #endif @@ -90,6 +96,8 @@ double getsec() void begin() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) gettimeofday( &start, (struct timezone *)0); +#elif defined(__APPLE__) + start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); #else clock_gettime(CLOCK_REALTIME, &start); #endif @@ -98,7 +106,9 @@ void begin() { void end() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) gettimeofday( &stop, (struct timezone *)0); +#elif defined(__APPLE__) + stop = 
clock_gettime_nsec_np(CLOCK_UPTIME_RAW); #else clock_gettime(CLOCK_REALTIME, &stop); #endif -} \ No newline at end of file +} diff --git a/c_check b/c_check index 970d475d7..e24943a29 100644 --- a/c_check +++ b/c_check @@ -1,11 +1,11 @@ -#!/usr/bin/perl +#!/usr/bin/env perl #use File::Basename; # use File::Temp qw(tempfile); # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); -$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); +$hostarch = `uname -m | sed -e s/i.86/x86/`; $hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); chop($hostarch); $hostarch = "x86_64" if ($hostarch eq "amd64"); diff --git a/cblas.h b/cblas.h index da00d46d6..f0220eb99 100644 --- a/cblas.h +++ b/cblas.h @@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); +void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); +void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_drotg(double *a, double *b, double *c, double *s); +void cblas_crotg(void *a, void *b, float *c, void *s); +void cblas_zrotg(void *a, void *b, double *c, void *s); + void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); void cblas_drotm(OPENBLAS_CONST blasint N, double *X, 
OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 5457bfb07..4451f9eaa 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,6 +45,9 @@ endif () if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + if (DYNAMIC_LIST) + set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) + endif () endif () if (POWER) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index f10905c4d..54a583887 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -2499,6 +2499,5 @@ foreach (Utils_FILE ${Utils_SRC}) endforeach () set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") -configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY) include_directories(${lapacke_include_dir}) set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") diff --git a/cmake/system.cmake b/cmake/system.cmake index 66e95c6d3..eee429113 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -148,16 +148,20 @@ endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") if (DEFINED TARGET) if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) -# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() -# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") -# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") -# endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR 
${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + endif() endif() if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") @@ -233,6 +237,11 @@ if (BINARY64) endif () endif () +if(EMBEDDED) + set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED") + set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16") +endif() + if (NEED_PIC) if (${CMAKE_C_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 8f25c1b27..29b5a067b 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN) string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") + if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER) + set (CMAKE_MATCH_1 CMAKE_C_COMPILER) + endif () if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) # message (STATUS "condition is true") set (IfElse 1) diff --git a/common.h b/common.h index 2825407cb..862e0b4db 100644 --- a/common.h +++ b/common.h @@ -122,7 +122,7 @@ extern "C" { #define ATOM GOTO_ATOM #undef GOTO_ATOM #endif -#else +#elif !defined(OS_EMBEDDED) #include #ifndef NO_SYSV_IPC #include @@ -134,6 +134,9 @@ extern "C" { #if defined(SMP) || defined(USE_LOCKING) #include #endif +#else +#include +#include #endif #if defined(OS_SUNOS) @@ -488,10 +491,12 @@ static inline unsigned long long rpcc(void){ struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; -#else +#elif !defined(OS_EMBEDDED) struct timeval tv; 
gettimeofday(&tv,NULL); return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; +#else + return 0; #endif } #define RPCC_DEFINED @@ -521,6 +526,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ #include "common_linux.h" #endif +#ifdef OS_EMBEDDED +#define DTB_DEFAULT_ENTRIES 64 +#endif + #define MMAP_ACCESS (PROT_READ | PROT_WRITE) #ifdef __NetBSD__ diff --git a/common_arm64.h b/common_arm64.h index 9cdded305..2270ffba7 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define INLINE inline -#ifdef F_INTERFACE_FLANG +#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) #define RETURN_BY_STACK #else #define RETURN_BY_COMPLEX diff --git a/cpuid_x86.c b/cpuid_x86.c index 84c12ff43..44704fcd9 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1418,6 +1418,15 @@ int get_cpuname(void){ case 9: case 8: switch (model) { + case 12: // Tiger Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; case 14: // Kaby Lake and refreshes if(support_avx2()) return CPUTYPE_HASWELL; @@ -1436,6 +1445,15 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + case 7: // Rocket Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; } @@ -2014,6 +2032,19 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; + case 7:// Rocket Lake +#ifndef NO_AVX512 + if(support_avx512()) + return CORE_SKYLAKEX; +#endif +#ifndef NO_AVX2 + if(support_avx2()) + return CORE_HASWELL; +#endif + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; } case 5: switch (model) { @@ -2102,6 +2133,16 @@ int get_coretype(void){ break; case 9: case 8: + if (model == 12) { // 
Tiger Lake + if(support_avx512()) + return CORE_SKYLAKEX; + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } if (model == 14) { // Kaby Lake if(support_avx()) #ifndef NO_AVX2 diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 8aed9eb85..17f29fe69 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -5,9 +5,18 @@ enable_language(Fortran) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & $args[0]\n" +) +set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1") +else() FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh "$1 < $2\n" ) +set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh") +endif() foreach(float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char_upper) @@ -21,7 +30,7 @@ foreach(float_type ${FLOAT_TYPES}) c_${float_char}blas1.c) target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat1" - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1") + COMMAND $<TARGET_FILE:x${float_char}cblat1>) #level2 add_executable(x${float_char}cblat2 @@ -33,7 +42,7 @@ foreach(float_type ${FLOAT_TYPES}) constant.c) target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat2" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") + COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") #level3 add_executable(x${float_char}cblat3 @@ -45,6 +54,6 @@ foreach(float_type ${FLOAT_TYPES}) constant.c) target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat3" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") + COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") endforeach() diff --git a/ctest/Makefile b/ctest/Makefile index 2a893cae8..15c83a907 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -212,6 +212,9 @@ ifeq ($(C_COMPILER), CLANG) CEXTRALIB = -lomp endif endif +ifeq ($(F_COMPILER), NAG) +CEXTRALIB = -lgomp +endif endif ifeq ($(BUILD_SINGLE),1) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 5e0943c2e..fa07a1ea4 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -1024,38 +1024,39 @@ int BLASFUNC(blas_thread_shutdown)(void){ int i; - if (!blas_server_avail) return 0; - LOCK_COMMAND(&server_lock); - for (i = 0; i < blas_num_threads - 1; i++) { + if (blas_server_avail) { + + for (i = 0; i < blas_num_threads - 1; i++) { - pthread_mutex_lock (&thread_status[i].lock); + pthread_mutex_lock (&thread_status[i].lock); - atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); - thread_status[i].status = THREAD_STATUS_WAKEUP; - pthread_cond_signal (&thread_status[i].wakeup); + atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); + thread_status[i].status = THREAD_STATUS_WAKEUP; + pthread_cond_signal (&thread_status[i].wakeup); - pthread_mutex_unlock(&thread_status[i].lock); + pthread_mutex_unlock(&thread_status[i].lock); - } + } - for(i = 0; i < blas_num_threads - 1; i++){ - pthread_join(blas_threads[i], NULL); - } + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_join(blas_threads[i], NULL); + } - for(i = 0; i < blas_num_threads - 1; i++){ - pthread_mutex_destroy(&thread_status[i].lock); - pthread_cond_destroy (&thread_status[i].wakeup); - } + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_mutex_destroy(&thread_status[i].lock); + pthread_cond_destroy (&thread_status[i].wakeup); + } #ifdef NEED_STACKATTR - pthread_attr_destory(&attr); + pthread_attr_destroy(&attr); 
#endif - blas_server_avail = 0; + blas_server_avail = 0; + } UNLOCK_COMMAND(&server_lock); return 0; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 58f4d8b59..158e1b3da 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -644,6 +644,21 @@ static gotoblas_t *get_coretype(void){ return NULL; case 9: case 8: + if (model == 12) { // Tiger Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } if (model == 14 ) { // Kaby Lake, Coffee Lake if(support_avx2()) return &gotoblas_HASWELL; @@ -656,7 +671,7 @@ static gotoblas_t *get_coretype(void){ } } case 10: - if (model == 5 || model == 6) { + if (model == 5 || model == 6) { if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { @@ -666,7 +681,20 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } - } + } + if (model == 7) { + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
+ } + } return NULL; } case 0xf: diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 4f1b12f27..6c68ba98a 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -43,6 +43,63 @@ #endif extern gotoblas_t gotoblas_ARMV8; +#ifdef DYNAMIC_LIST +#ifdef DYN_CORTEXA53 +extern gotoblas_t gotoblas_CORTEXA53; +#else +#define gotoblas_CORTEXA53 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA57 +extern gotoblas_t gotoblas_CORTEXA57; +#else +#define gotoblas_CORTEXA57 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA72 +extern gotoblas_t gotoblas_CORTEXA72; +#else +#define gotoblas_CORTEXA72 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA73 +extern gotoblas_t gotoblas_CORTEXA73; +#else +#define gotoblas_CORTEXA73 gotoblas_ARMV8 +#endif +#ifdef DYN_FALKOR +extern gotoblas_t gotoblas_FALKOR; +#else +#define gotoblas_FALKOR gotoblas_ARMV8 +#endif +#ifdef DYN_TSV110 +extern gotoblas_t gotoblas_TSV110; +#else +#define gotoblas_TSV110 gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX +extern gotoblas_t gotoblas_THUNDERX; +#else +#define gotoblas_THUNDERX gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX2T99 +extern gotoblas_t gotoblas_THUNDERX2T99; +#else +#define gotoblas_THUNDERX2T99 gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX3T110 +extern gotoblas_t gotoblas_THUNDERX3T110; +#else +#define gotoblas_THUNDERX3T110 gotoblas_ARMV8 +#endif +#ifdef DYN_EMAG8180 +extern gotoblas_t gotoblas_EMAG8180; +#else +#define gotoblas_EMAG8180 gotoblas_ARMV8 +#endif +#ifdef DYN_NEOVERSEN1 +extern gotoblas_t gotoblas_NEOVERSEN1; +#else +#define gotoblas_NEOVERSEN1 gotoblas_ARMV8 +#endif +#else extern gotoblas_t gotoblas_CORTEXA53; extern gotoblas_t gotoblas_CORTEXA57; extern gotoblas_t gotoblas_CORTEXA72; @@ -54,6 +111,7 @@ extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; extern gotoblas_t gotoblas_THUNDERX3T110; +#endif extern void openblas_warning(int verbose, const char * msg); @@ -68,7 +126,7 @@ extern void 
openblas_warning(int verbose, const char * msg); #endif #define get_cpu_ftr(id, var) ({ \ - __asm__("mrs %0, "#id : "=r" (var)); \ + __asm__ ("mrs %0, "#id : "=r" (var)); \ }) static char *corename[] = { diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index a2f56d839..d9c15b312 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -27,7 +27,9 @@ static char *corename[] = { #define NUM_CORETYPES 4 char *gotoblas_corename(void) { +#ifndef C_PGI if (gotoblas == &gotoblas_POWER6) return corename[1]; +#endif if (gotoblas == &gotoblas_POWER8) return corename[2]; #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (gotoblas == &gotoblas_POWER9) return corename[3]; @@ -38,10 +40,164 @@ char *gotoblas_corename(void) { return corename[0]; } +#if defined(__clang__) +static int __builtin_cpu_supports(char* arg) +{ + return 0; +} +#endif + +#if defined(C_PGI) || defined(__clang__) +/* + * NV HPC compilers do not yet implement __builtin_cpu_is(). + * Fake a version here for use in the CPU detection code below. + * + * Strategy here is to first check the CPU to see what it actually is, + * and then test the input to see if what the CPU actually is matches + * what was requested. + */ + +#include + +/* + * Define POWER processor version table. 
+ * + * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time + */ + +#define CPU_UNKNOWN 0 +#define CPU_POWER5 5 +#define CPU_POWER6 6 +#define CPU_POWER8 8 +#define CPU_POWER9 9 +#define CPU_POWER10 10 + +static struct { + uint32_t pvr_mask; + uint32_t pvr_value; + const char* cpu_name; + uint32_t cpu_type; +} pvrPOWER [] = { + + { /* POWER6 in P5+ mode; 2.04-compliant processor */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000001, + .cpu_name = "POWER5+", + .cpu_type = CPU_POWER5, + }, + + { /* Power6 aka POWER6X*/ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003e0000, + .cpu_name = "POWER6 (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power7 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003f0000, + .cpu_name = "POWER7 (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power7+ */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004A0000, + .cpu_name = "POWER7+ (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power8E */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004b0000, + .cpu_name = "POWER8E (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power8NVL */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004c0000, + .cpu_name = "POWER8NVL (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power8 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004d0000, + .cpu_name = "POWER8 (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power9 DD2.0 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0200, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power9 DD 2.1 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0201, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power9 DD2.2 or later */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004e0000, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power10 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00800000, + .cpu_name = "POWER10 (raw)", + .cpu_type = CPU_POWER10, + }, + + { /* End of table, pvr_mask and pvr_value must be zero */ + .pvr_mask = 0x0, + .pvr_value = 0x0, + 
.cpu_name = "Unknown", + .cpu_type = CPU_UNKNOWN, + }, +}; + +static int __builtin_cpu_is(const char *cpu) { + int i; + uint32_t pvr; + uint32_t cpu_type; + + asm("mfpvr %0" : "=r"(pvr)); + + for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { + if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { + break; + } + } + +#if defined(DEBUG) + printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, + pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); +#endif + cpu_type = pvrPOWER[i].cpu_type; + + if (!strcmp(cpu, "power8")) + return cpu_type == CPU_POWER8; + if (!strcmp(cpu, "power9")) + return cpu_type == CPU_POWER9; + return 0; +} + +#endif /* C_PGI */ + static gotoblas_t *get_coretype(void) { +#ifndef C_PGI if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) return &gotoblas_POWER6; +#endif if (__builtin_cpu_is("power8")) return &gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) @@ -53,7 +209,7 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER10; #endif /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ -#if (!defined __GNUC__) || ( __GNUC__ >= 6) +#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) if (__builtin_cpu_is("power10")) return &gotoblas_POWER9; #endif @@ -77,7 +233,9 @@ static gotoblas_t *force_coretype(char * coretype) { switch (found) { +#ifndef C_PGI case 1: return (&gotoblas_POWER6); +#endif case 2: return (&gotoblas_POWER8); #if (!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); diff --git a/driver/others/memory.c b/driver/others/memory.c index f0521ab2d..63fa6a566 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -222,11 +222,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; + +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; - -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) 
@@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock); func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -1619,10 +1619,12 @@ static int on_process_term(void) #else #pragma data_seg(".CRT$XLB") #endif -static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; + #ifdef _WIN64 +static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma const_seg() #else +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma data_seg() #endif @@ -1631,10 +1633,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI #else #pragma data_seg(".CRT$XTU") #endif -static int(*p_process_term)(void) = on_process_term; + #ifdef _WIN64 +static const int(*p_process_term)(void) = on_process_term; #pragma const_seg() #else +static int(*p_process_term)(void) = on_process_term; #pragma data_seg() #endif #endif @@ -1668,16 +1672,23 @@ void gotoblas_dummy_for_PGI(void) { #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 #endif -#else +#elif !defined(OS_EMBEDDED) #define ALLOC_MMAP #define ALLOC_MALLOC +#else +#define ALLOC_MALLOC + +inline int puts(const char *str) { return 0; } +inline int printf(const char *format, ...) 
{ return 0; } +inline char *getenv(const char *name) { return ""; } +inline int atoi(const char *str) { return 0; } #endif #include #include #include -#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) +#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) #include #ifndef NO_SYSV_IPC #include diff --git a/exports/gensymbol b/exports/gensymbol index 857a17a9e..e7210a030 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Changelog # 2017/09/03 staticfloat diff --git a/f_check b/f_check index 42241ae10..20281ea4d 100644 --- a/f_check +++ b/f_check @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); @@ -32,9 +32,9 @@ if ($compiler eq "") { "xlf95", "xlf90", "xlf", "ppuf77", "ppuf95", "ppuf90", "ppuxlf", "pathf90", "pathf95", - "pgf95", "pgf90", "pgf77", + "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran", "flang", "egfortran", - "ifort"); + "ifort", "nagfor"); OUTER: foreach $lists (@lists) { @@ -64,7 +64,9 @@ if ($compiler eq "") { if (!$?) 
{ $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; - + if ($data eq "") { + $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`; + } if ($data =~ /zhoge_/) { $bu = "_"; } @@ -76,6 +78,7 @@ if ($compiler eq "") { } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { + $data =~ s/\(+.*?\)+//g; $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; $minor = $2; @@ -87,7 +90,7 @@ if ($compiler eq "") { if ($compiler =~ /flang/) { $vendor = FLANG; $openmp = "-fopenmp"; - } elsif ($compiler =~ /pgf/) { + } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) { $vendor = PGI; $openmp = "-mp"; } else { @@ -123,7 +126,7 @@ if ($compiler eq "") { $openmp = "-mp"; } - if ($data =~ /PGF/) { + if ($data =~ /PGF/ || $data =~ /NVF/) { $vendor = PGI; $openmp = "-mp"; } @@ -133,8 +136,16 @@ if ($compiler eq "") { $openmp = "-openmp"; } + if ($data =~ /NAG/) { + $vendor = NAG; + $openmp = "-openmp"; + } + # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; + if ($data eq "") { + $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`; + } if ($data =~ / zho_ge__/) { $need2bu = 1; } @@ -177,7 +188,7 @@ if ($compiler eq "") { $openmp = "-mp"; } - if ($compiler =~ /pgf/) { + if ($compiler =~ /pgf/ || $compiler =~ /nvf/) { $vendor = PGI; $bu = "_"; $openmp = "-mp"; @@ -222,6 +233,12 @@ if ($compiler eq "") { $openmp = "-fopenmp"; } + if ($compiler =~ /nagfor/) { + $vendor = NAG; + $bu = "_"; + $openmp = "-openmp"; + } + if ($vendor eq "") { $nofortran = 1; $compiler = "gfortran"; @@ -275,14 +292,20 @@ if (!$?) { if ($?) { $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } + #For nagfor + if ($?) 
{ + $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; + } $binary = "" if ($?); } - if ($binary eq "") { $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; } } +if ( $vendor eq "NAG") { + $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; + } $linker_L = ""; $linker_l = ""; $linker_a = ""; @@ -330,12 +353,13 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /-lgomp/ && $CC =~ /clang/) { + if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { $flags = "-lomp"; } if ( ($flags =~ /^\-l/) + && ($flags !~ /ibrary/) && ($flags !~ /gfortranbegin/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) @@ -352,6 +376,16 @@ if ($link ne "") { $linker_l .= $flags . " "; } + if ( $flags =~ /quickfit.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + if ( $flags =~ /safefit.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + if ( $flags =~ /thsafe.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + $linker_a .= $flags . " " if $flags =~ /\.a$/; } diff --git a/getarch.c b/getarch.c index 29671736e..f48944f36 100644 --- a/getarch.c +++ b/getarch.c @@ -1375,6 +1375,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef __riscv #include "cpuid_riscv64.c" +#define OPENBLAS_SUPPORTED #endif #ifdef __arm__ diff --git a/getarch_2nd.c b/getarch_2nd.c index c390ef52c..53ecccf30 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -4,7 +4,7 @@ #else #include "config_kernel.h" #endif -#include "param.h" +#include "common.h" int main(int argc, char **argv) { diff --git a/interface/Makefile b/interface/Makefile index 597956fdb..3252601d2 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -316,7 +316,7 @@ CCBLAS1OBJS = \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ cblas_caxpby.$(SUFFIX) \ - cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) + cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ @@ -346,7 +346,7 @@ CZBLAS1OBJS = \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ cblas_zaxpby.$(SUFFIX) \ - cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) + cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) CZBLAS2OBJS = \ @@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1664,6 +1670,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) +cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + 
+cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + ifeq ($(BUILD_BFLOAT16),1) cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/interface/create b/interface/create index b7be8ab6e..0b9cefa2b 100755 --- a/interface/create +++ b/interface/create @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $count = 0; diff --git a/interface/gemm.c b/interface/gemm.c index 860e588fe..6fde69049 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -246,6 +246,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #ifdef SMP double MNK; +#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -264,6 +265,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif #endif #endif +#endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; @@ -417,8 +419,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #ifdef SMP +#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT); +#endif MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) diff --git a/interface/rotmg.c b/interface/rotmg.c index ce3b146c1..3a5ca8f95 100644 --- a/interface/rotmg.c +++ b/interface/rotmg.c @@ -107,7 +107,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ dq1 = dp1 * *dx1; if(ABS(dq1) > ABS(dq2)) { - dflag = ZERO; dh11 = ONE; dh22 = ONE; dh21 = - dy1 / *dx1; diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 6d8d759ad..f0793bdef 100644 --- a/kernel/CMakeLists.txt +++ 
b/kernel/CMakeLists.txt @@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) + string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) + if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE)) set(USE_TRMM true) endif () - if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) + if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () diff --git a/kernel/Makefile b/kernel/Makefile index 4e86546b9..1a6c9413f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) ifeq ($(GCCVERSIONGTEQ10), 1) override CFLAGS += -march=cooperlake else - override CFLAGS += -march=skylake-avx512 + override CFLAGS += -march=skylake-avx512 -mavx512f endif ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables @@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) endif endif else ifeq ($(TARGET_CORE), SKYLAKEX) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables endif diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 7ad94118a..09337363d 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -1,3 +1,11 @@ +FMAFLAG= +ifndef OLDGCC +ifdef HAVE_FMA3 +FMAFLAG = -mfma +endif +endif + + ### 
AMAX ### ifndef SAMAXKERNEL @@ -828,10 +836,10 @@ $(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@ $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ diff --git a/kernel/arm/omatcopy_rt.c b/kernel/arm/omatcopy_rt.c index 9d58350d5..3d90ac6e4 100644 --- a/kernel/arm/omatcopy_rt.c +++ b/kernel/arm/omatcopy_rt.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project +Copyright (c) 2021, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -/***************************************************** - * 2014/06/09 Saar - * - * Order rowMajor - * Trans - * -******************************************************/ - int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) { - BLASLONG i,j; - FLOAT *aptr,*bptr; + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; - if ( rows <= 0 ) return(0); - if ( cols <= 0 ) return(0); + if (rows <= 0) return 0; + if (cols <= 0) return 0; - aptr = a; + a_offset = a; + b_offset = b; - for ( i=0; i> 2); + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; - return(0); + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 4; + j = (cols >> 2); + if (j > 0) { + do { + /* Column 1 of MAT_B */ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + /* Column 2 of MAT_B */ + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + /* Column 3 of MAT_B */ + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; + *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; + + /* Column 4 of MAT_B */ + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; + *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + b_offset1 
+= ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } // if(j > 0) + + + if (cols & 2) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset1 += ldb*2; + + } + + if (cols & 1) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + } + + i--; + } while (i > 0); + } + + + if (rows & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 2; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += ldb*2; + + } + + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 
0)*alpha; + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + } + } // if (rows & 2) + + + if (rows & 1) { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + a_offset1 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + a_offset1 += 2; + b_offset1 += ldb * 2; + } + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + } + } + + return 0; } - diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 9249b54f8..79baa61b1 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -#if !defined(__PPC__) && !defined(__SunOS) +#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; #else @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } -#if !defined(__PPC__) && !defined(__SunOS) +#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) CREAL(result) = dot[0]; CIMAG(result) = dot[1]; #else diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 603e47d87..c8a53c86b 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = 
../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index e23133e52..db322dd0d 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S -DDOTKERNEL = dot.S -SDOTKERNEL = ../generic/dot.c -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S SGEMM_BETA = sgemm_beta.S diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index dcf2383a9..0be334893 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S CCOPYKERNEL = copy.S ZCOPYKERNEL = copy.S +ifneq ($(C_COMPILER), PGI) SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index cb02c7bc5..669f62698 100644 --- a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S SDOTKERNEL = dot_thunderx.c DDOTKERNEL = ddot_thunderx.c +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 index 1ce7bb7c0..54d016e17 100644 --- a/kernel/arm64/KERNEL.TSV110 +++ b/kernel/arm64/KERNEL.TSV110 @@ -72,8 +72,13 @@ 
ZCOPYKERNEL = copy.S SDOTKERNEL = dot.S DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index b94f0cffc..b021a2832 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -58,6 +58,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n #define CUR_MAXINV "d8" #define CUR_MAXINV_V "v8.2d" #define CUR_MAX_V "v8.2d" +#define REGINF "d9" static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, double *ssq, double *scale) @@ -79,8 +80,10 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ble 9f //nrm2_kernel_L999 \n" "1: //nrm2_kernel_F_BEGIN: \n" + " mov x6, #0x7FF0000000000000 //+Infinity \n" " fmov "REGZERO", xzr \n" " fmov "REGONE", #1.0 \n" + " fmov "REGINF", x6 \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " mov "J", "N" \n" " cmp "J", xzr \n" @@ -104,6 +107,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -116,6 +121,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -158,6 +165,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmaxp v24.2d, v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup 
"CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" @@ -217,6 +226,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmaxp v24.2d, v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup "CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" @@ -265,6 +276,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -276,6 +289,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -291,6 +306,11 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, "9: //nrm2_kernel_L999: \n" " str "SSQ", [%[SSQ_]] \n" " str "SCALE", [%[SCALE_]] \n" + " b 11f \n" + "10: \n" + " str "REGINF", [%[SSQ_]] \n" + " str "REGINF", [%[SCALE_]] \n" + "11: \n" : : [SSQ_] "r" (ssq), //%0 @@ -300,7 +320,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, [INCX_] "r" (inc_x) //%4 : "cc", "memory", - "x0", "x1", "x2", "x3", "x4", "x5", + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8" ); @@ -359,6 +379,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) cur_ssq = *ptr; cur_scale = *(ptr + 1); + if (cur_ssq == INFINITY) { + ssq = INFINITY; + scale = INFINITY; + break; + } + if (cur_scale != 0) { if (cur_scale > scale) { scale = (scale / cur_scale); diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index d61f5194a..1cf7b0b7c 100644 --- 
a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -154,11 +154,7 @@ ZCOPYKERNEL = zcopy_power10.c SDOTKERNEL = sdot_power10.c DDOTKERNEL = ddot_power10.c DSDOTKERNEL = sdot_power10.c -ifneq ($(GCCVERSIONGTEQ9),1) -CDOTKERNEL = cdot_power9.S -else CDOTKERNEL = cdot.c -endif ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c diff --git a/kernel/power/ccopy_microk_power10.c b/kernel/power/ccopy_microk_power10.c new file mode 100644 index 000000000..6c80f9cd4 --- /dev/null +++ b/kernel/power/ccopy_microk_power10.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL 1 + +static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ + ( + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + "addi %2, %2, 256 \n\t" + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + + "addi %3, %3, 256 \n\t" + "addi %2, %2, 256 \n\t" + + "addic. 
%1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" + + "#n=%1 x=%4=%2 y=%0=%3" + : + "=m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); +} diff --git a/kernel/power/ccopy_power10.c b/kernel/power/ccopy_power10.c index a5877cd12..41c510460 100644 --- a/kernel/power/ccopy_power10.c +++ b/kernel/power/ccopy_power10.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(__VEC__) || defined(__ALTIVEC__) -#include "copy_microk_power10.c" +#include "ccopy_microk_power10.c" #endif #ifndef HAVE_KERNEL @@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - BLASLONG n1 = n & -64; + BLASLONG n1 = n & -32; if ( n1 > 0 ) { copy_kernel(n1, x, y); diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index ef5e4710f..c53fe0c02 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #include "common.h" +#if defined(POWER10) +#include "cdot_microk_power10.c" +#else #ifndef HAVE_KERNEL_8 #include <altivec.h> @@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) } #endif +#endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { @@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { +#if defined(POWER10) + BLASLONG n1 = n & -16; +#else BLASLONG n1 = n & -8; +#endif BLASLONG j=0; if (n1){ diff --git a/kernel/power/cdot_microk_power10.c b/kernel/power/cdot_microk_power10.c new file mode 100644 index 000000000..399f2b180 --- /dev/null +++ b/kernel/power/cdot_microk_power10.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void cdot_kernel_8 (long n, float *x, float *y, float *dot) +{ + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "xxperm 56, 48, %x7 \n\t" + "xxperm 57, 49, %x7 \n\t" + "xxperm 58, 50, %x7 \n\t" + "xxperm 59, 51, %x7 \n\t" + + "xxperm 60, 52, %x7 \n\t" + "xxperm 61, 53, %x7 \n\t" + "xxperm 62, 54, %x7 \n\t" + "xxperm 63, 55, %x7 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvp 48, 0(%3) \n\t" + + "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvp 50, 32(%3) \n\t" + + "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvp 40, 0(%2) \n\t" + + "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvp 42, 32(%2) \n\t" + + "xxperm 56, 48, %x7 \n\t" + "xxperm 57, 49, %x7 \n\t" + "xxperm 58, 50, %x7 \n\t" + "xxperm 59, 51, %x7 \n\t" + + "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvp 52, 64(%3) \n\t" + + "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvp 54, 96(%3) \n\t" + + "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvp 44, 64(%2) \n\t" + "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvp 46, 96(%2) \n\t" + + "xxperm 60, 52, %x7 \n\t" + "xxperm 61, 53, %x7 \n\t" + "xxperm 62, 54, %x7 \n\t" + "xxperm 63, 55, %x7 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 33, 33, 35 \n\t" + "xvaddsp 37, 37, 39 \n\t" + + "xvaddsp 35, 32, 36 \n\t" + "xvaddsp 34, 33, 37 \n\t" + "xxswapd 32, 35 \n\t" + "xxswapd 33, 34 \n\t" + "xvaddsp 35, 35, 32 \n\t" + "xvaddsp 34, 34, 33 \n\t" + "xxpermdi 34, 34, 35, 2 \n\t" + "stxv 34, 0(%6) \n\t" + + "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" + : + "=m" (*dot), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y), + "b" (dot), // 6 + "wa" (mask) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/copy_microk_power10.c b/kernel/power/copy_microk_power10.c index c90dc3785..8bca1a1e7 100644 --- a/kernel/power/copy_microk_power10.c +++ b/kernel/power/copy_microk_power10.c @@ -62,38 +62,39 @@ static void 
copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "one%=: \n\t" "stxvp 32, 0(%3) \n\t" - "lxvp 32, 0(%2) \n\t" "stxvp 34, 32(%3) \n\t" - "lxvp 34, 32(%2) \n\t" "stxvp 36, 64(%3) \n\t" - "lxvp 36, 64(%2) \n\t" "stxvp 38, 96(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" "lxvp 38, 96(%2) \n\t" "stxvp 40, 128(%3) \n\t" - "lxvp 40, 128(%2) \n\t" "stxvp 42, 160(%3) \n\t" - "lxvp 42, 160(%2) \n\t" "stxvp 44, 192(%3) \n\t" - "lxvp 44, 192(%2) \n\t" "stxvp 46, 224(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" "lxvp 46, 224(%2) \n\t" "stxvp 48, 256(%3) \n\t" - "lxvp 48, 256(%2) \n\t" "stxvp 50, 288(%3) \n\t" - "lxvp 50, 288(%2) \n\t" "stxvp 52, 320(%3) \n\t" - "lxvp 52, 320(%2) \n\t" "stxvp 54, 352(%3) \n\t" + "lxvp 48, 256(%2) \n\t" + "lxvp 50, 288(%2) \n\t" + "lxvp 52, 320(%2) \n\t" "lxvp 54, 352(%2) \n\t" + "stxvp 56, 384(%3) \n\t" - "lxvp 56, 384(%2) \n\t" "stxvp 58, 416(%3) \n\t" - "lxvp 58, 416(%2) \n\t" "stxvp 60, 448(%3) \n\t" - "lxvp 60, 448(%2) \n\t" "stxvp 62, 480(%3) \n\t" + "lxvp 56, 384(%2) \n\t" + "lxvp 58, 416(%2) \n\t" + "lxvp 60, 448(%2) \n\t" "lxvp 62, 480(%2) \n\t" "addi %3, %3, 512 \n\t" diff --git a/kernel/power/cscal_microk_power10.c b/kernel/power/cscal_microk_power10.c new file mode 100644 index 000000000..70b50809e --- /dev/null +++ b/kernel/power/cscal_microk_power10.c @@ -0,0 +1,176 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) +{ + __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; + __asm__ + ( + "dcbt 0, %2 \n\t" + "xscvdpspn 32, %x3 \n\t" + "xxspltw 32, 32, 0 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmulsp 49, 41, 32 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + + "xxperm 34, 40, %x5 \n\t" + "xxperm 35, 41, %x5 \n\t" + "xxperm 36, 42, %x5 \n\t" + "xxperm 37, 43, %x5 \n\t" + "xxperm 38, 44, %x5 \n\t" + "xxperm 39, 45, %x5 \n\t" + "xxperm 56, 46, %x5 \n\t" + "xxperm 57, 47, %x5 \n\t" + + "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmulsp 35, 35, %x4 \n\t" + + "lxvp 40, 128(%2) \n\t" + + "xvmulsp 36, 36, %x4 \n\t" + "xvmulsp 37, 37, %x4 \n\t" + + "lxvp 42, 160(%2) \n\t" + + "xvmulsp 38, 38, %x4 \n\t" + "xvmulsp 39, 39, %x4 \n\t" + + "lxvp 44, 192(%2) \n\t" + + "xvmulsp 56, 56, %x4 \n\t" + "xvmulsp 57, 57, %x4 \n\t" + + "lxvp 46, 224(%2) \n\t" + + "xvaddsp 48, 48, 34 \n\t" + "xvaddsp 49, 49, 35 \n\t" + "xvaddsp 50, 50, 36 \n\t" + "xvaddsp 51, 51, 37 \n\t" + + "stxvp 48, 0(%2) \n\t" + + "xvaddsp 52, 52, 38 \n\t" + "xvaddsp 53, 53, 39 \n\t" + + "stxvp 50, 32(%2) \n\t" + + "xvaddsp 54, 54, 56 \n\t" + "xvaddsp 55, 55, 57 \n\t" + + "stxvp 52, 64(%2) \n\t" + "stxvp 54, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmulsp 49, 41, 32 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + + "xxperm 34, 40, %x5 \n\t" + "xxperm 35, 41, %x5 \n\t" + "xxperm 36, 42, %x5 \n\t" + "xxperm 37, 43, %x5 \n\t" + "xxperm 38, 44, %x5 \n\t" + "xxperm 39, 45, %x5 \n\t" + "xxperm 56, 46, %x5 \n\t" + "xxperm 57, 47, %x5 \n\t" + + + "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmulsp 35, 35, %x4 \n\t" + "xvmulsp 36, 36, %x4 \n\t" + "xvmulsp 37, 37, %x4 \n\t" + "xvmulsp 38, 38, %x4 \n\t" + "xvmulsp 39, 39, %x4 \n\t" + "xvmulsp 56, 56, %x4 \n\t" + "xvmulsp 57, 57, %x4 \n\t" + + "xvaddsp 48, 48, 34 \n\t" + "xvaddsp 49, 49, 35 \n\t" + "xvaddsp 50, 50, 36 \n\t" + "xvaddsp 51, 51, 37 \n\t" + + "stxvp 48, 0(%2) \n\t" + + "xvaddsp 52, 52, 38 \n\t" + "xvaddsp 53, 53, 39 \n\t" + + "stxvp 50, 32(%2) \n\t" + + "xvaddsp 54, 54, 56 \n\t" + "xvaddsp 55, 55, 57 \n\t" + + "stxvp 52, 64(%2) \n\t" + "stxvp 54, 96(%2) \n\t" + + "#n=%1 x=%0=%2 alpha=(%3,%4)\n" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "f" (alpha_r), // 3 + "wa" (t0), // 4 + "wa" (mask) // 5 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 5144a2e93..4d9b9ccd6 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" +#elif defined(POWER10) +#include "cswap_microk_power10.c" #endif #endif diff --git a/kernel/power/cswap_microk_power10.c b/kernel/power/cswap_microk_power10.c new file mode 100644 index 000000000..2a44a9e30 --- /dev/null +++ b/kernel/power/cswap_microk_power10.c @@ -0,0 +1,127 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#if defined(DOUBLE) +#define HAVE_KERNEL_16 1 +static void zswap_kernel_16 (long n, double *x, double *y) +#else +#define HAVE_KERNEL_32 1 +static void cswap_kernel_32 (long n, float *x, float *y) +#endif +{ + __asm__ + ( + ".align 5 \n" + "one%=: \n\t" + "lxvp 32, 0(%4) \n\t" + "lxvp 34, 32(%4) \n\t" + "lxvp 36, 64(%4) \n\t" + "lxvp 38, 96(%4) \n\t" + + "lxvp 40, 128(%4) \n\t" + "lxvp 42, 160(%4) \n\t" + "lxvp 44, 192(%4) \n\t" + "lxvp 46, 224(%4) \n\t" + + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + + "addi %3, %3, 128 \n\t" + + "stxv 41, 0(%3) \n\t" + "stxv 40, 16(%3) \n\t" + "stxv 43, 32(%3) \n\t" + "stxv 42, 48(%3) \n\t" + "stxv 45, 64(%3) \n\t" + "stxv 44, 80(%3) \n\t" + "stxv 47, 96(%3) \n\t" + "stxv 46, 112(%3) \n\t" + + "addi %3, %3, 128 \n\t" + + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 53, 64(%4) \n\t" + "stxv 52, 80(%4) \n\t" + "stxv 55, 96(%4) \n\t" + "stxv 54, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + + "stxv 57, 0(%4) \n\t" + "stxv 56, 
16(%4) \n\t" + "stxv 59, 32(%4) \n\t" + "stxv 58, 48(%4) \n\t" + "stxv 61, 64(%4) \n\t" + "stxv 60, 80(%4) \n\t" + "stxv 63, 96(%4) \n\t" + "stxv 62, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + +#if defined(DOUBLE) + "addic. %2, %2, -16 \n\t" +#else + "addic. %2, %2, -32 \n\t" +#endif + "bgt one%= \n" + + "#n=%2 x=%0=%3 y=%1=%4" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 999dc677a..0cdec3292 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" +#elif defined(POWER10) +#include "dasum_microk_power10.c" #endif #endif @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + sumf += ABS(x[i]); + } + } + n1 = (n-i) & -16; + if ( n1 > 0 ) + { + sumf += dasum_kernel_16(n1, &x[i]); + i+=n1; + } +#else n1 = n & -16; if ( n1 > 0 ) { @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sumf = dasum_kernel_16(n1, x); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/dasum_microk_power10.c b/kernel/power/dasum_microk_power10.c new file mode 100644 index 000000000..d1a21b4d1 --- /dev/null +++ b/kernel/power/dasum_microk_power10.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 
2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static double dasum_kernel_16 (long n, double *x) +{ + double sum; + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "lxvp 40, 0(%2) \n\t" + + + "xvabsdp %x3, 44 \n\t" + "xvabsdp %x4, 45 \n\t" + "lxvp 42, 32(%2) \n\t" + + + "xvabsdp %x5, 46 \n\t" + "xvabsdp %x6, 47 \n\t" + "lxvp 44, 64(%2) \n\t" + + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + + "lxvp 46, 96(%2) \n\t" + + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvadddp 36, 36, %x3 \n\t" + "xvadddp 37, 37, %x4 \n\t" + "addic. 
%1, %1, -16 \n\t" + "xvadddp 38, 38, %x5 \n\t" + "xvadddp 39, 39, %x6 \n\t" + + "bgt one%= \n" + + "two%=: \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "xvabsdp %x3, 44 \n\t" + "xvabsdp %x4, 45 \n\t" + "xvabsdp %x5, 46 \n\t" + "xvabsdp %x6, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "xvadddp 36, 36, %x3 \n\t" + "xvadddp 37, 37, %x4 \n\t" + "xvadddp 38, 38, %x5 \n\t" + "xvadddp 39, 39, %x6 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + XXSWAPD_S(33,32) + "xsadddp %x0, 32, 33 \n" + + "#n=%1 x=%3=%2 sum=%0\n" + "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" + : + "=d" (sum), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3) // 6 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); + + return sum; +} + + diff --git a/kernel/power/dcopy_power10.c b/kernel/power/dcopy_power10.c index cd10b7136..6c5eb4d77 100644 --- a/kernel/power/dcopy_power10.c +++ b/kernel/power/dcopy_power10.c @@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - - BLASLONG n1 = n & -64; - if ( n1 > 0 ) + if ( n >= 64 ) { - copy_kernel(n1, x, y); - i=n1; + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] = x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 ) + { + copy_kernel(n1, &x[i], &y[i]); + i += n1; } while(i < n) diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index b531799a6..e918e61c3 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ 
b/kernel/power/dgemm_kernel_power10.c @@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); -typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair +#endif #ifdef TRMMKERNEL #define SAVE_ACC(ACC, J) \ @@ -186,8 +192,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; vec_t *rb = (vec_t *) & BO[0]; __vector_pair rowB, rowB1; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -200,8 +206,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, { rowA = (vec_t *) & AO[l << 3]; rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -242,8 +248,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + 
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -252,8 +258,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, { rowA = (vec_t *) & AO[l << 2]; rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -286,16 +292,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); } @@ -398,7 +404,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, 
rowA[1]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); @@ -407,7 +413,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, { rowA = (vec_t *) & AO[l << 3]; rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -440,14 +446,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -476,13 +482,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE_ACC (&acc0, 0); @@ -562,11 +568,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair 
rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); @@ -574,9 +578,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 3]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -607,19 +610,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0, acc1; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 2]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -646,18 +646,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - 
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 1]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 951c2f9c9..94d9d95a3 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" +#elif defined(POWER10) +#include "drot_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 > 0 ) + { + drot_kernel_16(n1,&x[i], &y[i], c, s); + i+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { drot_kernel_16(n1, x1, y1, c, s); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/drot_microk_power10.c b/kernel/power/drot_microk_power10.c new file mode 100644 index 000000000..e34e745c7 --- /dev/null +++ b/kernel/power/drot_microk_power10.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void drot_kernel_16 (long n, double *x, double *y, double c, double s) +{ + __asm__ + ( + XXSPLTD_S(36,%x5,0) // load c to both dwords + XXSPLTD_S(37,%x6,0) // load s to both dwords + "lxvp 32, 0(%3) \n\t" // load x + "lxvp 34, 32(%3) \n\t" + "lxvp 48, 0(%4) \n\t" // load y + "lxvp 50, 32(%4) \n\t" + + "addic. 
%2, %2, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 40, 32, 36 \n\t" // c * x + "xvmuldp 41, 33, 36 \n\t" + "xvmuldp 42, 34, 36 \n\t" + "xvmuldp 43, 35, 36 \n\t" + + "xvmuldp 44, 32, 37 \n\t" // s * x + "xvmuldp 45, 33, 37 \n\t" + "xvmuldp 46, 34, 37 \n\t" + "xvmuldp 47, 35, 37 \n\t" + + "lxvp 32, 64(%3) \n\t" // load x + "lxvp 34, 96(%3) \n\t" + "xvmuldp 52, 48, 36 \n\t" // c * y + "xvmuldp 53, 49, 36 \n\t" + "xvmuldp 54, 50, 36 \n\t" + "xvmuldp 55, 51, 36 \n\t" + + "xvmuldp 38, 48, 37 \n\t" // s * y + "xvmuldp 39, 49, 37 \n\t" + "xvmuldp 56, 50, 37 \n\t" + "xvmuldp 57, 51, 37 \n\t" + + "lxvp 48, 64(%4) \n\t" // load y + "lxvp 50, 96(%4) \n\t" + + "xvadddp 40, 40, 38 \n\t" // c * x + s * y + "xvadddp 41, 41, 39 \n\t" // c * x + s * y + "xvadddp 42, 42, 56 \n\t" // c * x + s * y + "xvadddp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + + "xvsubdp 52, 52, 44 \n\t" // c * y - s * x + "xvsubdp 53, 53, 45 \n\t" // c * y - s * x + "xvsubdp 54, 54, 46 \n\t" // c * y - s * x + "xvsubdp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "addi %3, %3, 64 \n\t" + "addi %4, %4, 64 \n\t" + + "addic. 
%2, %2, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 40, 32, 36 \n\t" // c * x + "xvmuldp 41, 33, 36 \n\t" + "xvmuldp 42, 34, 36 \n\t" + "xvmuldp 43, 35, 36 \n\t" + + "xvmuldp 52, 48, 36 \n\t" // c * y + "xvmuldp 53, 49, 36 \n\t" + "xvmuldp 54, 50, 36 \n\t" + "xvmuldp 55, 51, 36 \n\t" + + "xvmuldp 44, 32, 37 \n\t" // s * x + "xvmuldp 45, 33, 37 \n\t" + "xvmuldp 46, 34, 37 \n\t" + "xvmuldp 47, 35, 37 \n\t" + + "xvmuldp 38, 48, 37 \n\t" // s * y + "xvmuldp 39, 49, 37 \n\t" + "xvmuldp 56, 50, 37 \n\t" + "xvmuldp 57, 51, 37 \n\t" + + "xvadddp 40, 40, 38 \n\t" // c * x + s * y + "xvadddp 41, 41, 39 \n\t" // c * x + s * y + "xvadddp 42, 42, 56 \n\t" // c * x + s * y + "xvadddp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + "xvsubdp 52, 52, 44 \n\t" // c * y - s * x + "xvsubdp 53, 53, 45 \n\t" // c * y - s * x + "xvsubdp 54, 54, 46 \n\t" // c * y - s * x + "xvsubdp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + "d" (c), // 5 + "d" (s) // 6 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 39293252b..96c4e51bc 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" +#elif defined(POWER10) +#include "dscal_microk_power10.c" #endif #endif @@ -100,12 +102,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (j = 0; j < align; j++) { + x[j] = 0.0; + } + } + BLASLONG n1 = (n-j) & -16; + if ( n1 > 0 ) + { + dscal_kernel_8_zero(n1, &x[j]); + j+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { dscal_kernel_8_zero(n1, x); j=n1; } +#endif while(j < n) { @@ -118,12 +136,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (j = 0; j < align; j++) { + x[j] = da * x[j]; + } + } + BLASLONG n1 = (n-j) & -16; + if ( n1 > 0 ) + { + dscal_kernel_8(n1, &x[j], da); + j+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { dscal_kernel_8(n1, x, da); j=n1; } +#endif while(j < n) { diff --git a/kernel/power/dscal_microk_power10.c b/kernel/power/dscal_microk_power10.c new file mode 100644 index 000000000..d0d506f24 --- /dev/null +++ b/kernel/power/dscal_microk_power10.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8 (long n, double *x, double alpha) +{ + __asm__ + ( + "dcbt 0, %2 \n\t" + + XXSPLTD_S(48,%x3,0) + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 40, 32, 48 \n\t" + "xvmuldp 41, 33, 48 \n\t" + "xvmuldp 42, 34, 48 \n\t" + "xvmuldp 43, 35, 48 \n\t" + "lxvp 32, 128(%2) \n\t" + "lxvp 34, 160(%2) \n\t" + "xvmuldp 44, 36, 48 \n\t" + "xvmuldp 45, 37, 48 \n\t" + "xvmuldp 46, 38, 48 \n\t" + "xvmuldp 47, 39, 48 \n\t" + "lxvp 36, 192(%2) \n\t" + "lxvp 38, 224(%2) \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 40, 32, 48 \n\t" + "xvmuldp 41, 33, 48 \n\t" + "xvmuldp 42, 34, 48 \n\t" + "xvmuldp 43, 35, 48 \n\t" + + "xvmuldp 44, 36, 48 \n\t" + "xvmuldp 45, 37, 48 \n\t" + "xvmuldp 46, 38, 48 \n\t" + "xvmuldp 47, 39, 48 \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "#n=%1 alpha=%3 x=%0=%2" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "d" (alpha) // 3 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" + ); +} + + +static void dscal_kernel_8_zero (long n, double *x) +{ + + __asm__ + ( + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%2) \n\t" + "stxvp 32, 32(%2) \n\t" + "stxvp 32, 64(%2) \n\t" + "stxvp 32, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "#n=%1 x=%0=%2 " + : + "=m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + : + "cr0","vs32","vs33" + ); +} diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index ff3f95c79..9e6229c6a 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" +#elif defined(POWER10) +#include "swap_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + BLASLONG n1 = (n-i) & -32; + if ( n1 > 0 ) + { + dswap_kernel_32(n1,&x[i], &y[i]); + i+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { dswap_kernel_32(n1, x, y); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 733137012..af692a7fa 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" +#elif defined(POWER10) +#include "sasum_microk_power10.c" #endif #endif @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + sumf += ABS(x[i]); + } + } + n1 = (n-i) & -32; + if ( n1 > 0 ) + { + sumf += sasum_kernel_32(n1, &x[i]); + i+=n1; + } +#else n1 = n & -32; if ( n1 > 0 ) { @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sumf = sasum_kernel_32(n1, x); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/sasum_microk_power10.c b/kernel/power/sasum_microk_power10.c new file mode 100644 index 000000000..ea12a4264 --- /dev/null +++ b/kernel/power/sasum_microk_power10.c @@ -0,0 +1,153 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_32 1 + +static float sasum_kernel_32 (long n, float *x) +{ + float sum; + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "lxvp 40, 0(%2) \n\t" + + "xvabssp %x3, 44 \n\t" + "xvabssp %x4, 45 \n\t" + "lxvp 42, 32(%2) \n\t" + + "xvabssp %x5, 46 \n\t" + "xvabssp %x6, 47 \n\t" + "lxvp 44, 64(%2) \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + + "lxvp 46, 96(%2) \n\t" + + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvaddsp 36, 36, %x3 \n\t" + "xvaddsp 37, 37, %x4 \n\t" + "addic. %1, %1, -32 \n\t" + "xvaddsp 38, 38, %x5 \n\t" + "xvaddsp 39, 39, %x6 \n\t" + + "bgt one%= \n" + + "two%=: \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "xvabssp %x3, 44 \n\t" + "xvabssp %x4, 45 \n\t" + "xvabssp %x5, 46 \n\t" + "xvabssp %x6, 47 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "xvaddsp 36, 36, %x3 \n\t" + "xvaddsp 37, 37, %x4 \n\t" + "xvaddsp 38, 38, %x5 \n\t" + "xvaddsp 39, 39, %x6 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + "xxsldwi 33, 32, 32, 2 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xxsldwi 33, 32, 32, 1 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xscvspdp %x0, 32 \n" + + "#n=%1 x=%3=%2 sum=%0\n" + "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" + : + "=f" (sum), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3) // 6 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); + + return sum; +} diff --git a/kernel/power/scopy_power10.c b/kernel/power/scopy_power10.c index 298a8998a..3398ce827 100644 --- 
a/kernel/power/scopy_power10.c +++ b/kernel/power/scopy_power10.c @@ -86,11 +86,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - BLASLONG n1 = n & -128; - if ( n1 > 0 ) + if ( n >= 128 ) { - copy_kernel (n1, x, y); - i=n1; + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] = x[i] ; + } + } + BLASLONG n1 = (n-i) & -128; + if ( n1 ) + { + copy_kernel(n1, &x[i], &y[i]); + i += n1; } while(i < n) diff --git a/kernel/power/srot.c b/kernel/power/srot.c index a53342f61..3e4f93e2a 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" +#elif defined(POWER10) +#include "srot_microk_power10.c" #endif #endif @@ -115,6 +117,23 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 > 0 ) + { + srot_kernel_16(n1, &x1[i], &y1[i], c, s); + i+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { @@ -122,6 +141,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT i=n1; } +#endif while(i < n) { temp = c*x[i] + s*y[i] ; diff --git a/kernel/power/srot_microk_power10.c b/kernel/power/srot_microk_power10.c new file mode 100644 index 000000000..c54c30742 --- /dev/null +++ b/kernel/power/srot_microk_power10.c @@ -0,0 +1,151 @@ +/*************************************************************************** 
+Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void srot_kernel_16 (long n, float *x, float *y, float c, float s) +{ + __asm__ + ( + "xscvdpspn 36, %x5 \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + + "xscvdpspn 37, %x6 \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvp 32, 0(%3) \n\t" // load x + "lxvp 34, 32(%3) \n\t" + "lxvp 48, 0(%4) \n\t" // load y + "lxvp 50, 32(%4) \n\t" + + "addic. 
%2, %2, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + + "lxvp 32, 64(%3) \n\t" // load x + "lxvp 34, 96(%3) \n\t" + "xvmulsp 52, 48, 36 \n\t" // c * y + "xvmulsp 53, 49, 36 \n\t" + "xvmulsp 54, 50, 36 \n\t" + "xvmulsp 55, 51, 36 \n\t" + + "xvmulsp 38, 48, 37 \n\t" // s * y + "xvmulsp 39, 49, 37 \n\t" + "xvmulsp 56, 50, 37 \n\t" + "xvmulsp 57, 51, 37 \n\t" + + "lxvp 48, 64(%4) \n\t" // load y + "lxvp 50, 96(%4) \n\t" + + "xvaddsp 40, 40, 38 \n\t" // c * x + s * y + "xvaddsp 41, 41, 39 \n\t" // c * x + s * y + "xvaddsp 42, 42, 56 \n\t" // c * x + s * y + "xvaddsp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + + "xvsubsp 52, 52, 44 \n\t" // c * y - s * x + "xvsubsp 53, 53, 45 \n\t" // c * y - s * x + "xvsubsp 54, 54, 46 \n\t" // c * y - s * x + "xvsubsp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "addi %3, %3, 64 \n\t" + "addi %4, %4, 64 \n\t" + + "addic. 
%2, %2, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + + "xvmulsp 52, 48, 36 \n\t" // c * y + "xvmulsp 53, 49, 36 \n\t" + "xvmulsp 54, 50, 36 \n\t" + "xvmulsp 55, 51, 36 \n\t" + + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + + "xvmulsp 38, 48, 37 \n\t" // s * y + "xvmulsp 39, 49, 37 \n\t" + "xvmulsp 56, 50, 37 \n\t" + "xvmulsp 57, 51, 37 \n\t" + + "xvaddsp 40, 40, 38 \n\t" // c * x + s * y + "xvaddsp 41, 41, 39 \n\t" // c * x + s * y + "xvaddsp 42, 42, 56 \n\t" // c * x + s * y + "xvaddsp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + "xvsubsp 52, 52, 44 \n\t" // c * y - s * x + "xvsubsp 53, 53, 45 \n\t" // c * y - s * x + "xvsubsp 54, 54, 46 \n\t" // c * y - s * x + "xvsubsp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + "f" (c), // 5 + "f" (s) // 6 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index de37e10a5..65572a8c1 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" +#elif defined(POWER10) +#include "sscal_microk_power10.c" #endif #endif @@ -102,12 +104,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (j = 0; j < align; j++) { + x[j] = 0.0; + } + } + BLASLONG n1 = (n-j) & -32; + if ( n1 > 0 ) + { + sscal_kernel_16_zero(n1, &x[j]); + j+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sscal_kernel_16_zero(n1, x); j=n1; } +#endif while(j < n) { @@ -120,12 +138,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (j = 0; j < align; j++) { + x[j] = da * x[j]; + } + } + BLASLONG n1 = (n-j) & -32; + if ( n1 > 0 ) + { + sscal_kernel_16(n1, &x[j], da); + j+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sscal_kernel_16(n1, x, da); j=n1; } +#endif while(j < n) { diff --git a/kernel/power/sscal_microk_power10.c b/kernel/power/sscal_microk_power10.c new file mode 100644 index 000000000..a523a1675 --- /dev/null +++ b/kernel/power/sscal_microk_power10.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void sscal_kernel_16 (long n, float *x, float alpha) +{ + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xscvdpspn 48, %x3 \n\t" + "xxspltw 48, 48, 0 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + + "addic. 
%1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 40, 32, 48 \n\t" + "xvmulsp 41, 33, 48 \n\t" + "xvmulsp 42, 34, 48 \n\t" + "xvmulsp 43, 35, 48 \n\t" + "lxvp 32, 128(%2) \n\t" + "lxvp 34, 160(%2) \n\t" + "xvmulsp 44, 36, 48 \n\t" + "xvmulsp 45, 37, 48 \n\t" + "xvmulsp 46, 38, 48 \n\t" + "xvmulsp 47, 39, 48 \n\t" + "lxvp 36, 192(%2) \n\t" + "lxvp 38, 224(%2) \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 40, 32, 48 \n\t" + "xvmulsp 41, 33, 48 \n\t" + "xvmulsp 42, 34, 48 \n\t" + "xvmulsp 43, 35, 48 \n\t" + + "xvmulsp 44, 36, 48 \n\t" + "xvmulsp 45, 37, 48 \n\t" + "xvmulsp 46, 38, 48 \n\t" + "xvmulsp 47, 39, 48 \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "#n=%1 alpha=%3 x=%0=%2" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "f" (alpha) // 3 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" + ); +} + + +static void sscal_kernel_16_zero (long n, float *x) +{ + + __asm__ + ( + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%2) \n\t" + "stxvp 32, 32(%2) \n\t" + "stxvp 32, 64(%2) \n\t" + "stxvp 32, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "#n=%1 x=%0=%2 " + : + "=m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + : + "cr0","vs32","vs33" + ); +} diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 44522f0a0..dd249fd36 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" +#elif defined(POWER10) +#include "swap_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { +#if defined(POWER10) + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 > 0 ) + { + sswap_kernel_32(n1,&x[i], &y[i]); + i+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sswap_kernel_32(n1, x, y); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/swap_microk_power10.c b/kernel/power/swap_microk_power10.c new file mode 100644 index 000000000..f9c1fee52 --- /dev/null +++ b/kernel/power/swap_microk_power10.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define HAVE_KERNEL_32 1 + +#if defined(DOUBLE) +static void dswap_kernel_32 (long n, double *x, double *y) +#else +static void sswap_kernel_32 (long n, float *x, float *y) +#endif +{ + __asm__ + ( + ".align 5 \n" + "one%=: \n\t" + + "lxvp 32, 0(%4) \n\t" + "lxvp 34, 32(%4) \n\t" + "lxvp 36, 64(%4) \n\t" + "lxvp 38, 96(%4) \n\t" + + "lxvp 40, 128(%4) \n\t" + "lxvp 42, 160(%4) \n\t" + "lxvp 44, 192(%4) \n\t" + "lxvp 46, 224(%4) \n\t" + + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + "stxvp 32, 0(%3) \n\t" + "stxvp 34, 32(%3) \n\t" + "stxvp 36, 64(%3) \n\t" + "stxvp 38, 96(%3) \n\t" + + "stxvp 40, 128(%3) \n\t" + "stxvp 42, 160(%3) \n\t" + "stxvp 44, 192(%3) \n\t" + "stxvp 46, 224(%3) \n\t" + + "stxvp 48, 0(%4) \n\t" + "stxvp 50, 32(%4) \n\t" + "stxvp 52, 64(%4) \n\t" + "stxvp 54, 96(%4) \n\t" + + "stxvp 56, 128(%4) \n\t" + "stxvp 58, 160(%4) \n\t" + "stxvp 60, 192(%4) \n\t" + "stxvp 62, 224(%4) \n\t" + + "addi %4, %4, 256 \n\t" + "addi %3, %3, 256 \n\t" + +#if defined(DOUBLE) + 
"addic. %2, %2, -32 \n\t" +#else + "addic. %2, %2, -64 \n\t" +#endif + "bgt one%= \n" + + "#n=%2 x=%0=%3 y=%1=%4" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 5526f4d67..0068138e8 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,11 +38,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif +#elif defined(POWER10) +#if defined(DOUBLE) +#include "zscal_microk_power10.c" +#else +#include "cscal_microk_power10.c" +#endif #endif #endif @@ -145,7 +151,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F { +#if defined(DOUBLE) n1 = n & -8; +#else + n1 = n & -16; +#endif if ( n1 > 0 ) { zscal_kernel_8(n1, x, da_r, da_i); diff --git a/kernel/power/zscal_microk_power10.c b/kernel/power/zscal_microk_power10.c new file mode 100644 index 000000000..15b8323f4 --- /dev/null +++ b/kernel/power/zscal_microk_power10.c @@ -0,0 +1,195 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) +{ + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + __vector double t4; + __vector double t5; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xsnegdp 33, %x10 \n\t" // -alpha_i + XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r + XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addic. 
%1, %1, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 34, 44, 32 \n\t" + "xvmuldp 35, 45, 32 \n\t" + "xvmuldp 36, 46, 32 \n\t" + "xvmuldp 37, 47, 32 \n\t" + + XXSWAPD_S(38,40) + XXSWAPD_S(39,41) + XXSWAPD_S(%x3,42) + XXSWAPD_S(%x4,43) + XXSWAPD_S(%x5,44) + XXSWAPD_S(%x6,45) + XXSWAPD_S(%x7,46) + XXSWAPD_S(%x8,47) + + "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 39, 39, 33 \n\t" + + + "xvmuldp %x3, %x3, 33 \n\t" + "xvmuldp %x4, %x4, 33 \n\t" + + + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "xvmuldp %x5, %x5, 33 \n\t" + "xvmuldp %x6, %x6, 33 \n\t" + + + "xvmuldp %x7, %x7, 33 \n\t" + "xvmuldp %x8, %x8, 33 \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + + "xvadddp 48, 48, 38 \n\t" + "xvadddp 49, 49, 39 \n\t" + "xvadddp 50, 50, %x3 \n\t" + "xvadddp 51, 51, %x4 \n\t" + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" + + + "xvadddp 34, 34, %x5 \n\t" + "xvadddp 35, 35, %x6 \n\t" + + + "xvadddp 36, 36, %x7 \n\t" + "xvadddp 37, 37, %x8 \n\t" + + "stxv 35, 64(%2) \n\t" + "stxv 34, 80(%2) \n\t" + "stxv 37, 96(%2) \n\t" + "stxv 36, 112(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%1, %1, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 34, 44, 32 \n\t" + "xvmuldp 35, 45, 32 \n\t" + "xvmuldp 36, 46, 32 \n\t" + "xvmuldp 37, 47, 32 \n\t" + + XXSWAPD_S(38,40) + XXSWAPD_S(39,41) + XXSWAPD_S(%x3,42) + XXSWAPD_S(%x4,43) + XXSWAPD_S(%x5,44) + XXSWAPD_S(%x6,45) + XXSWAPD_S(%x7,46) + XXSWAPD_S(%x8,47) + + + "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 39, 39, 33 \n\t" + "xvmuldp %x3, %x3, 33 \n\t" + "xvmuldp %x4, %x4, 33 \n\t" + "xvmuldp %x5, %x5, 33 \n\t" + "xvmuldp %x6, %x6, 33 \n\t" + "xvmuldp %x7, %x7, 33 \n\t" + "xvmuldp %x8, %x8, 33 \n\t" + + "xvadddp 48, 48, 38 \n\t" + "xvadddp 49, 49, 39 \n\t" + + "xvadddp 50, 50, %x3 \n\t" + "xvadddp 51, 51, %x4 \n\t" + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" + + "xvadddp 34, 34, %x5 \n\t" + "xvadddp 35, 35, %x6 \n\t" + + + "xvadddp 36, 36, %x7 \n\t" + "xvadddp 37, 37, %x8 \n\t" + + "stxv 35, 64(%2) \n\t" + "stxv 34, 80(%2) \n\t" + "stxv 37, 96(%2) \n\t" + "stxv 36, 112(%2) \n\t" + + "#n=%1 x=%0=%2 alpha=(%9,%10) \n" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3), // 6 + "=wa" (t4), // 7 + "=wa" (t5) // 8 + : + "d" (alpha_r), // 9 + "d" (alpha_i) // 10 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 3a5a8eb83..6cd3d9664 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" +#elif defined(POWER10) +#include "cswap_microk_power10.c" #endif #endif diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index b92f480e9..5da79cc3f 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -489,3 +489,6 @@ XGEMM3MKERNEL = xgemm3m_kernel_2x2.S SSUMKERNEL = ../arm/sum.c DSUMKERNEL = ../arm/sum.c + +SOMATCOPY_RT = omatcopy_rt.c +DOMATCOPY_RT = omatcopy_rt.c diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index 7bb308fea..a66394be3 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -97,3 +97,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c +SROTKERNEL = srot.c +DROTKERNEL = drot.c diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index ddec21383..8af9e798b 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -6,7 +6,7 @@ #if defined(SKYLAKEX) #include "dasum_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "dasum_microk_haswell-2.c" #endif @@ -93,7 +93,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #if defined(SMP) int nthreads; FLOAT dummy_alpha; - FLOAT * dummy_b; #endif FLOAT sumf = 0.0; @@ -115,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index 66e9ff907..ab5048bd1 100644 --- 
a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -2,7 +2,7 @@ #if defined(SKYLAKEX) #include "drot_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "drot_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/omatcopy_rt.c b/kernel/x86_64/omatcopy_rt.c new file mode 100644 index 000000000..e695f00c5 --- /dev/null +++ b/kernel/x86_64/omatcopy_rt.c @@ -0,0 +1,373 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#ifdef HAVE_AVX + +#define ROWS_OF_BLOCK 384 + + /* +r: %0 = src, %1 = dst, %2 = src_ld, %3 = dst_ld, %4 = dst_tmp */ +/* m: %5 = num_rows, %6 = alpha */ +/* xmm15 = alpha */ +#define TRANS_4x4(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ + "vunpcklps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t1_no"; vunpckhps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t2_no";"\ + "vunpcklps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t3_no"; vunpckhps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t4_no";"\ + "vunpcklpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a1_no"; vunpckhpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a2_no";"\ + "vunpcklpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a3_no"; vunpckhpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a4_no";" + +#define TRANS_4x8(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ + "vunpcklps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t1_no"; vunpckhps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t2_no";"\ + "vunpcklps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t3_no"; vunpckhps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t4_no";"\ + "vunpcklpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a1_no"; vunpckhpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a2_no";"\ + "vunpcklpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a3_no"; vunpckhpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a4_no";" + +#define SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ + "vmovups %%xmm"#b1_no",(%4); vmovups %%xmm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vmovups %%xmm"#b3_no",(%4); vmovups 
%%xmm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define SAVE_4x8(b1_no,b2_no,b3_no,b4_no) SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ + "vextractf128 $1,%%ymm"#b1_no",(%4); vextractf128 $1,%%ymm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vextractf128 $1,%%ymm"#b3_no",(%4); vextractf128 $1,%%ymm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_4x16 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%ymm15,%%ymm0; vmulps 32(%0),%%ymm15,%%ymm4; vmulps (%0,%2,1),%%ymm15,%%ymm1; vmulps 32(%0,%2,1),%%ymm15,%%ymm5; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%ymm15,%%ymm2; vmulps 32(%0),%%ymm15,%%ymm6; vmulps (%0,%2,1),%%ymm15,%%ymm3; vmulps 32(%0,%2,1),%%ymm15,%%ymm7; leaq (%0,%2,2),%0;"\ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3)\ + TRANS_4x8(4,5,6,7,8,9,10,11) SAVE_4x8(4,5,6,7) + +#define COPY_4x8 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%ymm15,%%ymm0; vmulps (%0,%2,1),%%ymm15,%%ymm1; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%ymm15,%%ymm2; vmulps (%0,%2,1),%%ymm15,%%ymm3; leaq (%0,%2,2),%0;"\ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3) + +#define COPY_4x4 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%xmm15,%%xmm0; vmulps (%0,%2,1),%%xmm15,%%xmm1; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%xmm15,%%xmm2; vmulps (%0,%2,1),%%xmm15,%%xmm3; leaq (%0,%2,2),%0;"\ + TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x4(0,1,2,3) + +#define COPY_4x2 \ + "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vmovsd (%0),%%xmm1; vmovhpd (%0,%2,1),%%xmm1,%%xmm1; vmulps %%xmm15,%%xmm1,%%xmm1; leaq (%0,%2,2),%0;"\ + "vpermilps $216,%%xmm0,%%xmm0; vpermilps $216,%%xmm1,%%xmm1; vunpcklpd %%xmm1,%%xmm0,%%xmm2; vunpckhpd %%xmm1,%%xmm0,%%xmm3;"\ + "vmovups %%xmm2,(%1); vmovups %%xmm3,(%1,%3,1); addq $16,%1;" + +#define COPY_4x1 \ + "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vinsertps $32,(%0),%%xmm0,%%xmm0; vinsertps $48,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vmulps %%xmm15,%%xmm0,%%xmm0; vmovups %%xmm0,(%1); addq 
$16,%1;" + +#define SAVE_2x4(c1_no,c2_no,t1_no,t2_no) \ + "vunpcklps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t1_no"; vmulps %%xmm15,%%xmm"#t1_no",%%xmm"#t1_no";"\ + "vmovsd %%xmm"#t1_no",(%4); vmovhpd %%xmm"#t1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vunpckhps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t2_no"; vmulps %%xmm15,%%xmm"#t2_no",%%xmm"#t2_no";"\ + "vmovsd %%xmm"#t2_no",(%4); vmovhpd %%xmm"#t2_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_2x16 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm2; vmovups (%0,%2,1),%%ymm1; vmovups 32(%0,%2,1),%%ymm3; leaq (%0,%2,2),%0;"\ + "vextractf128 $1,%%ymm0,%%xmm4; vextractf128 $1,%%ymm2,%%xmm6; vextractf128 $1,%%ymm1,%%xmm5; vextractf128 $1,%%ymm3,%%xmm7;"\ + SAVE_2x4(0,1,8,9) SAVE_2x4(4,5,8,9) SAVE_2x4(2,3,8,9) SAVE_2x4(6,7,8,9) + +#define COPY_2x8 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%ymm0; vmovups (%0,%2,1),%%ymm1; leaq (%0,%2,2),%0;"\ + "vextractf128 $1,%%ymm0,%%xmm2; vextractf128 $1,%%ymm1,%%xmm3;"\ + SAVE_2x4(0,1,4,5) SAVE_2x4(2,3,4,5) + +#define COPY_2x4 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%xmm0; vmovups (%0,%2,1),%%xmm1; leaq (%0,%2,2),%0;"\ + SAVE_2x4(0,1,4,5) + +#define COPY_2x2 \ + "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd %%xmm0,(%1); vmovhpd %%xmm0,(%1,%3,1); addq $8,%1;" + +#define COPY_2x1 \ + "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vmovsd %%xmm0,(%1); addq $8,%1;" + +#define SAVE_1x4(c1_no)\ + "vmulps %%xmm15,%%xmm"#c1_no",%%xmm"#c1_no"; vmovss %%xmm"#c1_no",(%4); vextractps $1,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vextractps $2,%%xmm"#c1_no",(%4); vextractps $3,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_1x16 "movq %1,%4; addq $4,%1;"\ + "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2)\ + "vmovups 32(%0),%%xmm1;" SAVE_1x4(1) "vmovups 48(%0),%%xmm2;" 
SAVE_1x4(2) "addq %2,%0;" + +#define COPY_1x8 "movq %1,%4; addq $4,%1;"\ + "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;" + +#define COPY_1x4 "movq %1,%4; addq $4,%1; vmovups (%0),%%xmm1;" SAVE_1x4(1) "addq %2,%0;" + +#define COPY_1x2 "vmovsd (%0),%%xmm1; addq %2,%0; vmulps %%xmm15,%%xmm1,%%xmm1; vmovss %%xmm1,(%1); vextractps $1,%%xmm1,(%1,%3,1); addq $4,%1;" + +#define COPY_1x1 "vmulss (%0),%%xmm15,%%xmm1; vmovss %%xmm1,(%1); addq %2,%0; addq $4,%1;" + +#define COMPUTE(ndim){\ + src = src_base; dst = dst_base;\ + __asm__ __volatile__(\ + "vbroadcastss %6,%%ymm15; movq %5,%%r11; cmpq $4,%%r11; jb "#ndim"32f;"\ + #ndim"31:\n\t"\ + COPY_4x##ndim "subq $4,%%r11; cmpq $4,%%r11; jnb "#ndim"31b;"\ + #ndim"32:\n\t"\ + "cmpq $2,%%r11; jb "#ndim"33f;"\ + COPY_2x##ndim "subq $2,%%r11;"\ + #ndim"33:\n\t"\ + "testq %%r11,%%r11; jz "#ndim"34f;"\ + COPY_1x##ndim "subq $1,%%r11;"\ + #ndim"34:\n\t"\ + :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp):"m"(num_rows),"m"(ALPHA):"r11","cc","memory"\ + ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ +} +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){ + float *src, *dst, *dst_tmp, *src_base, *dst_base; + uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0; + BLASLONG cols_left, rows_done; float ALPHA = alpha; + if(ALPHA==0.0){ + dst_base = b; + for(cols_left=cols;cols_left>0;cols_left--) {memset(dst_base,0,rows*sizeof(float)); dst_base += ldb;} + return 0; + } + for(rows_done=0;rows_done<rows;rows_done+=num_rows){ + num_rows = rows-rows_done; + if(num_rows > ROWS_OF_BLOCK) num_rows = ROWS_OF_BLOCK; + cols_left = cols; src_base = a + (int64_t)lda * (int64_t)rows_done; dst_base = b + rows_done; + if(ldb%1024>3 && ldb%1024<1021) for(;cols_left>15;cols_left-=16){COMPUTE(16) src_base += 16; dst_base += 16 * ldb;} + for(;cols_left>7;cols_left-=8){COMPUTE(8)
src_base += 8; dst_base += 8 * ldb;} + for(;cols_left>3;cols_left-=4){COMPUTE(4) src_base += 4; dst_base += 4 * ldb;} + for(;cols_left>1;cols_left-=2){COMPUTE(2) src_base += 2; dst_base += 2 * ldb;} + if(cols_left>0){COMPUTE(1) src_base ++; dst_base += ldb;} + } + return 0; +} + +#else + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; + + if (rows <= 0) return 0; + if (cols <= 0) return 0; + + a_offset = a; + b_offset = b; + + i = (rows >> 2); + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 4; + + j = (cols >> 2); + if (j > 0) { + do { + /* Column 1 of MAT_B */ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + /* Column 2 of MAT_B */ + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + /* Column 3 of MAT_B */ + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; + *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; + + /* Column 4 of MAT_B */ + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; + *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + b_offset1 += ldb * 4; + 
b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } // if(j > 0) + + + if (cols & 2) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset1 += ldb*2; + + } + + if (cols & 1) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + } + + i--; + } while (i > 0); + } + + + if (rows & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 2; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += ldb*2; + + } + + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + 
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + } + } // if (rows & 2) + + + if (rows & 1) { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + a_offset1 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + a_offset1 += 2; + b_offset1 += ldb * 2; + } + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + } + } + + return 0; +} + +#endif diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index d0cea9bee..a021741c7 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -11,7 +11,7 @@ #if defined(SKYLAKEX) #include "sasum_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "sasum_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c new file mode 100644 index 000000000..2376fed02 --- /dev/null +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -0,0 +1,426 @@ +#include "sbgemm.h" + +#include <immintrin.h> +// Walk around those intrinsics that missed by compiler +#define MM256_LOADU_EPI16(addr) \ + _mm256_maskz_loadu_epi16(~0, (addr)) +#define MM256_STOREU_EPI16(addr, reg) \ + _mm256_mask_storeu_epi16((addr), ~0, (reg)) + +#include <stdio.h> +void print_block(BLASLONG m, BLASLONG n, bfloat16 * mat) +{ + printf("---- BLOCK %ld x %ld ----\n", m, n); + for (BLASLONG i=0; i> (32-m)); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + __m512i array512_0, array512_1, array512_2, array512_3; + + BLASLONG
idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*lda; + BLASLONG BF16_BLOCK_T_M_2x = 2*32; + idx_src_base0 = 0; + idx_src_base1 = lda; + idx_target_base0 = 0; + idx_target_base1 = 32; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); + array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1); + array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1); + _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); + _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m512i ZERO512 = _mm512_setzero_si512(); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); + array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); + array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); + _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); + _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); + } + +#ifdef DEBUG_PROFILE + print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); +#endif +} + +void COL_MAJOR_INCOPY_KERNEL_Kx16(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + BLASLONG tag_k_2x = k & (~1); + + __m256i array256_0, array256_1, array256_2, array256_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0; + + BLASLONG LDA_2x = 2*lda; + idx_src_base0 = 0; + idx_src_base1 = lda; + idx_target_base0 = 0; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); + array256_1 = MM256_LOADU_EPI16(&A[idx_src_base1]); + array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); + array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); + // Store in one row 
of block_B + MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); + MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += 32; + } + + if (tag_k_2x != k) { + __m256i ZERO256 = _mm256_setzero_si256(); + array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); + array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); + array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); + // Store in one row of block_B + MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); + MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + } + +#ifdef DEBUG_PROFILE + print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); +#endif +} + +void COL_MAJOR_INCOPY_KERNEL_Kx16m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + BLASLONG tag_k_2x = k & (~1); + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + __m256i array256_0, array256_1, array256_2, array256_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0; + + BLASLONG LDA_2x = 2*lda; + idx_src_base0 = 0; + idx_src_base1 = lda; + idx_target_base0 = 0; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); + array256_1 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); + array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); + array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); + // Store in one row of block_B + MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); + MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += 32; + } + + if (tag_k_2x != k) { + __m256i ZERO256 = _mm256_setzero_si256(); + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); + array256_2 = _mm256_unpacklo_epi16(array256_0, 
ZERO256); + array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); + // Store in one row of block_B + MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); + MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + } + +#ifdef DEBUG_PROFILE + print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); +#endif +} + +void COL_MAJOR_ONCOPY_KERNEL_8x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + BLASLONG idx_src_base0, idx_src_base1, idx_src_base2, idx_src_base3, idx_src_base4, idx_src_base5, idx_src_base6, idx_src_base7; + BLASLONG idx_target_base0; + + idx_src_base0 = 0; + idx_src_base1 = 1*ldb; + idx_src_base2 = 2*ldb; + idx_src_base3 = 3*ldb; + idx_src_base4 = 4*ldb; + idx_src_base5 = 5*ldb; + idx_src_base6 = 6*ldb; + idx_src_base7 = 7*ldb; + idx_target_base0 = 0; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base1+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_loadu_si512(&B[idx_src_base2+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_loadu_si512(&B[idx_src_base3+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_loadu_si512(&B[idx_src_base4+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_loadu_si512(&B[idx_src_base5+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_loadu_si512(&B[idx_src_base6+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_loadu_si512(&B[idx_src_base7+idx_k])); + idx_target_base0 += 32*8; + } + + if (tag_k_32x != k) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], 
_mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base1+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base2+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base3+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base4+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base5+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base6+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base7+tag_k_32x])); + } + +#ifdef DEBUG_PROFILE + print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); +#endif +} + +void COL_MAJOR_ONCOPY_KERNEL_Nx32(BLASLONG n, BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + BLASLONG tag_n_2x = n & (~1); + BLASLONG idx_src_base0; + BLASLONG idx_target_base0; + + BLASLONG LDB_2x = 2*ldb; + + idx_target_base0 = 0; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + idx_src_base0 = 0; + for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base0 + ldb + idx_k])); + idx_src_base0 += LDB_2x; + idx_target_base0 += 64; + } + + if (tag_n_2x != n) { + _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); + idx_target_base0 += 32; + } + } + + if (tag_k_32x != k) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> 
(32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + idx_src_base0 = 0; + for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + ldb + tag_k_32x])); + idx_src_base0 += LDB_2x; + idx_target_base0 += 64; + } + + if (tag_n_2x != n) { + _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); + } + } + +#ifdef DEBUG_PROFILE + print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); +#endif +} + +// Scale matrix C while beta is not ZERO or ONE +void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc) +{ + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + + BLASLONG LDC4x = ldc*4; + BLASLONG idx_base_0 = 0; + BLASLONG idx_base_1 = ldc; + BLASLONG idx_base_2 = ldc*2; + BLASLONG idx_base_3 = ldc*3; + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + __m512 array_512_0, array_512_1, array_512_2, array_512_3; + + __m512 BETAVECTOR = _mm512_set1_ps(beta); + + if (Order == CblasColMajor) { + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); + array_512_1 = _mm512_loadu_ps(&C[idx_base_1+idx_m]); + array_512_2 = _mm512_loadu_ps(&C[idx_base_2+idx_m]); + array_512_3 = _mm512_loadu_ps(&C[idx_base_3+idx_m]); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = 
_mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); + _mm512_storeu_ps(&C[idx_base_1+idx_m], array_512_1); + _mm512_storeu_ps(&C[idx_base_2+idx_m], array_512_2); + _mm512_storeu_ps(&C[idx_base_3+idx_m], array_512_3); + } + + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); + array_512_1 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_1+tag_n_Mx]); + array_512_2 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_2+tag_n_Mx]); + array_512_3 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_3+tag_n_Mx]); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); + _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, array_512_1); + _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, array_512_2); + _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, array_512_3); + } + + idx_base_0 += LDC4x; + idx_base_1 += LDC4x; + idx_base_2 += LDC4x; + idx_base_3 += LDC4x; + } + + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); + } + + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); + } + idx_base_0 += ldc; + } + } + } else { + + } +} + +// Scale matrix C while beta is not ZERO or ONE +void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, 
OPENBLAS_CONST blasint ldc) +{ + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + + BLASLONG LDC4x = ldc*4; + BLASLONG idx_base_0 = 0; + BLASLONG idx_base_1 = ldc; + BLASLONG idx_base_2 = ldc*2; + BLASLONG idx_base_3 = ldc*3; + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + __m512 ZEROVECTOR = _mm512_setzero_ps(); + + if (Order == CblasColMajor) { + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); + _mm512_storeu_ps(&C[idx_base_1+idx_m], ZEROVECTOR); + _mm512_storeu_ps(&C[idx_base_2+idx_m], ZEROVECTOR); + _mm512_storeu_ps(&C[idx_base_3+idx_m], ZEROVECTOR); + } + + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, ZEROVECTOR); + } + + idx_base_0 += LDC4x; + idx_base_1 += LDC4x; + idx_base_2 += LDC4x; + idx_base_3 += LDC4x; + } + + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); + } + + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); + } + idx_base_0 += ldc; + } + } + } else { + + } +} \ No newline at end of file diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c new file mode 100644 index 000000000..dd4cb440b --- /dev/null +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -0,0 +1,625 @@ +#include "sbgemm.h" +#include "bf16_common_macros.h" +#include + +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef 
SBGEMM_BLOCK_KERNEL_32x8x32 +#undef SBGEMM_BLOCK_KERNEL_16x8x32 +#undef SBGEMM_BLOCK_KERNEL_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_16xNx32 +#undef SBGEMM_BLOCKING_KERNEL_2 + +#ifndef ONE_ALPHA // ALPHA is not ONE + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE + #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_alpha + #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_alpha + #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_alpha + #define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_alpha +#else // ALPHA is ONE + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE + #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_one + #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_one + #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_one + #define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_one +#endif + + +// SBGEMM Kernel for 16> (32-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16]), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16]), tail_mask) + result_512_tmp_0 = 
_mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16]), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16]), tail_mask) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16]), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16]), tail_mask) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16]), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16]), tail_mask) + } else { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); + 
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0])) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16])) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1])) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16])) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2])) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16])) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3])) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16])) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4])) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16])) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5])) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16])) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); 
+ result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6])) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16])) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7])) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16])) + } +} + +// SBGEMM Kernel for M<=16, N=8, K can be any number, but the processing will take 32 as a base +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_16x8x32_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_16x8x32_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + int SHUFFLE_MAGIC_NO = 0x39; + BLASLONG tag_k_32x = k & (~31); + BLASLONG idxB_base = 0; + BLASLONG width = 32; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; + __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7; + + result_512_0 = _mm512_setzero_ps(); + result_512_1 = _mm512_setzero_ps(); + result_512_2 = _mm512_setzero_ps(); + result_512_3 = _mm512_setzero_ps(); + result_512_4 = _mm512_setzero_ps(); + result_512_5 = _mm512_setzero_ps(); + result_512_6 = _mm512_setzero_ps(); + result_512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { + // Load B with unroll 8 + idxB_base = idx_k << 3; + arrayB_512_0 = _mm512_loadu_si512(&B[idxB_base + 32*0]); + arrayB_512_1 = _mm512_loadu_si512(&B[idxB_base + 32*1]); + arrayB_512_2 = _mm512_loadu_si512(&B[idxB_base + 32*2]); + arrayB_512_3 = _mm512_loadu_si512(&B[idxB_base + 32*3]); + arrayB_512_4 = _mm512_loadu_si512(&B[idxB_base + 32*4]); + arrayB_512_5 = _mm512_loadu_si512(&B[idxB_base + 32*5]); + arrayB_512_6 = 
_mm512_loadu_si512(&B[idxB_base + 32*6]); + arrayB_512_7 = _mm512_loadu_si512(&B[idxB_base + 32*7]); + + if (idx_k == tag_k_32x) {width = k - tag_k_32x;} + + for (BLASLONG idx = 0; idx < width;) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(&A[idx<<4]); + + result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0))); + result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1))); + result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2))); + result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3))); + result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4))); + result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5))); + result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6))); + result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7))); + + arrayB_512_0 = _mm512_shuffle_epi32(arrayB_512_0, SHUFFLE_MAGIC_NO); + arrayB_512_1 = _mm512_shuffle_epi32(arrayB_512_1, SHUFFLE_MAGIC_NO); + arrayB_512_2 = _mm512_shuffle_epi32(arrayB_512_2, SHUFFLE_MAGIC_NO); + arrayB_512_3 = _mm512_shuffle_epi32(arrayB_512_3, SHUFFLE_MAGIC_NO); + arrayB_512_4 = _mm512_shuffle_epi32(arrayB_512_4, SHUFFLE_MAGIC_NO); + arrayB_512_5 = _mm512_shuffle_epi32(arrayB_512_5, SHUFFLE_MAGIC_NO); + arrayB_512_6 = 
_mm512_shuffle_epi32(arrayB_512_6, SHUFFLE_MAGIC_NO); + arrayB_512_7 = _mm512_shuffle_epi32(arrayB_512_7, SHUFFLE_MAGIC_NO); + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + arrayB_512_0 = _mm512_shuffle_i32x4(arrayB_512_0, arrayB_512_0, SHUFFLE_MAGIC_NO); + arrayB_512_1 = _mm512_shuffle_i32x4(arrayB_512_1, arrayB_512_1, SHUFFLE_MAGIC_NO); + arrayB_512_2 = _mm512_shuffle_i32x4(arrayB_512_2, arrayB_512_2, SHUFFLE_MAGIC_NO); + arrayB_512_3 = _mm512_shuffle_i32x4(arrayB_512_3, arrayB_512_3, SHUFFLE_MAGIC_NO); + arrayB_512_4 = _mm512_shuffle_i32x4(arrayB_512_4, arrayB_512_4, SHUFFLE_MAGIC_NO); + arrayB_512_5 = _mm512_shuffle_i32x4(arrayB_512_5, arrayB_512_5, SHUFFLE_MAGIC_NO); + arrayB_512_6 = _mm512_shuffle_i32x4(arrayB_512_6, arrayB_512_6, SHUFFLE_MAGIC_NO); + arrayB_512_7 = _mm512_shuffle_i32x4(arrayB_512_7, arrayB_512_7, SHUFFLE_MAGIC_NO); + } + } + } + + if (m != 16) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); + result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); + result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); + result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512_0, (&C[ldc*0]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (&C[ldc*1]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (&C[ldc*2]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (&C[ldc*3]), tail_mask) + result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); + result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); + result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); + result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512_4, 
(&C[ldc*4]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (&C[ldc*5]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (&C[ldc*6]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (&C[ldc*7]), tail_mask) + } else { + result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); + result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); + result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); + result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); + STORE16_COMPLETE_RESULT(result_512_0, (&C[ldc*0])) + STORE16_COMPLETE_RESULT(result_512_1, (&C[ldc*1])) + STORE16_COMPLETE_RESULT(result_512_2, (&C[ldc*2])) + STORE16_COMPLETE_RESULT(result_512_3, (&C[ldc*3])) + result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); + result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); + result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); + result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); + STORE16_COMPLETE_RESULT(result_512_4, (&C[ldc*4])) + STORE16_COMPLETE_RESULT(result_512_5, (&C[ldc*5])) + STORE16_COMPLETE_RESULT(result_512_6, (&C[ldc*6])) + STORE16_COMPLETE_RESULT(result_512_7, (&C[ldc*7])) + } +} + +// SBGEMM Kernel for 16> (32-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + for (int i = 0; i < n; i++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16]), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, 
(&C[ldc*i])) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16])) + } + } +} + +// SBGEMM Kernel for 16<=M, N<8, K can be any number, but the processing will take 32 as a base +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + int SHUFFLE_MAGIC_NO = 0x39; + BLASLONG tag_k_32x = k & (~31); + BLASLONG idxB_base = 0; + BLASLONG width = 32; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + for (int i = 0; i < 8; i += 2) { + result_512[i] = _mm512_setzero_ps(); + result_512[i+1] = _mm512_setzero_ps(); + } + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { + // Load B with unroll n + for (int i = 0; i < n; i ++) { + arrayB_512[i] = _mm512_loadu_si512(&B[idxB_base]); + idxB_base += 32; + } + + if (idx_k == tag_k_32x) {width = k - tag_k_32x;} + + for (BLASLONG idx = 0; idx < width;) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512 = _mm512_loadu_si512(&A[idx<<4]); + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (m != 16) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + 
for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512[i], (&C[ldc*i]), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_COMPLETE_RESULT(result_512[i], (&C[ldc*i])) + } + } +} +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_2_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, 0), lda, block_A); + // TODO: MT + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, idx_m), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else if (m_step == 16) { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 
tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_INCOPY_KERNEL_Kx16m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M - tag_m_Nx; + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, 0), lda, block_A); + // TODO: MT + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) +#else // ALPHA is ONE +void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) +#endif +{ + bfloat16 block_A[BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M]; + bfloat16 block_B[BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K]; + + // TODO: assume no trans for both A 
and B, to complement these scenarios later + if (Order == CblasColMajor) { + SBGEMM_BLOCKING_KERNEL_2(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else { + + } +} \ No newline at end of file diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c index aaadcf151..cc2ac5553 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,8 +1,11 @@ /* the direct sgemm code written by Arjan van der Ven */ + + +#if defined(SKYLAKEX) || defined (COOPERLAKE) + #include <immintrin.h> #include "common.h" -#if defined(SKYLAKEX) || defined (COOPERLAKE) /* * "Direct sgemm" code. This code operates directly on the inputs and outputs * of the sgemm call, avoiding the copies, memory realignments and threading, diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index 3de586cb8..587cf8e40 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -2,7 +2,7 @@ #if defined(SKYLAKEX) #include "srot_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "srot_microk_haswell-2.c" #endif @@ -13,7 +13,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; -#if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128) +#if V_SIMD && !defined(C_PGI) && (defined(HAVE_FMA3) || V_SIMD > 128) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c index 8e245cc8f..b5545726e 100644 --- a/kernel/x86_64/srot_microk_haswell-2.c +++ b/kernel/x86_64/srot_microk_haswell-2.c @@ -1,5 +1,4 @@ -/* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#if defined(HAVE_FMA3) && defined(HAVE_AVX2) #define HAVE_SROT_KERNEL 1 diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index
73d35621c..4725e7169 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -320,12 +320,13 @@ $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, - $ U12, X + $ U12, X, ABI12, Y * .. * .. External Functions .. + COMPLEX CLADIV LOGICAL LSAME REAL CLANHS, SLAMCH - EXTERNAL LSAME, CLANHS, SLAMCH + EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH * .. * .. External Subroutines .. EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA @@ -729,22 +730,34 @@ AD22 = ( ASCALE*H( ILAST, ILAST ) ) / $ ( BSCALE*T( ILAST, ILAST ) ) ABI22 = AD22 - U12*AD21 + ABI12 = AD12 - U12*AD11 * - T1 = HALF*( AD11+ABI22 ) - RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) - TEMP = REAL( T1-ABI22 )*REAL( RTDISC ) + - $ AIMAG( T1-ABI22 )*AIMAG( RTDISC ) - IF( TEMP.LE.ZERO ) THEN - SHIFT = T1 + RTDISC - ELSE - SHIFT = T1 - RTDISC + SHIFT = ABI22 + CTEMP = SQRT( ABI12 )*SQRT( AD21 ) + TEMP = ABS1( CTEMP ) + IF( CTEMP.NE.ZERO ) THEN + X = HALF*( AD11-SHIFT ) + TEMP2 = ABS1( X ) + TEMP = MAX( TEMP, ABS1( X ) ) + Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) + IF( TEMP2.GT.ZERO ) THEN + IF( REAL( X / TEMP2 )*REAL( Y )+ + $ AIMAG( X / TEMP2 )*AIMAG( Y ).LT.ZERO )Y = -Y + END IF + SHIFT = SHIFT - CTEMP*CLADIV( CTEMP, ( X+Y ) ) END IF ELSE * * Exceptional shift. Chosen for no particularly good reason. * - ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ - $ (BSCALE*T(ILAST-1,ILAST-1)) + IF( ( IITER / 20 )*20.EQ.IITER .AND.
+ $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) + ELSE + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) + END IF SHIFT = ESHIFT END IF * diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index b51cba4f7..b28ae47a4 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -320,12 +320,13 @@ $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, - $ U12, X + $ U12, X, ABI12, Y * .. * .. External Functions .. + COMPLEX*16 ZLADIV LOGICAL LSAME DOUBLE PRECISION DLAMCH, ZLANHS - EXTERNAL LSAME, DLAMCH, ZLANHS + EXTERNAL ZLADIV, LSAME, DLAMCH, ZLANHS * .. * .. External Subroutines .. EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL @@ -730,22 +731,34 @@ AD22 = ( ASCALE*H( ILAST, ILAST ) ) / $ ( BSCALE*T( ILAST, ILAST ) ) ABI22 = AD22 - U12*AD21 + ABI12 = AD12 - U12*AD11 * - T1 = HALF*( AD11+ABI22 ) - RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) - TEMP = DBLE( T1-ABI22 )*DBLE( RTDISC ) + - $ DIMAG( T1-ABI22 )*DIMAG( RTDISC ) - IF( TEMP.LE.ZERO ) THEN - SHIFT = T1 + RTDISC - ELSE - SHIFT = T1 - RTDISC + SHIFT = ABI22 + CTEMP = SQRT( ABI12 )*SQRT( AD21 ) + TEMP = ABS1( CTEMP ) + IF( CTEMP.NE.ZERO ) THEN + X = HALF*( AD11-SHIFT ) + TEMP2 = ABS1( X ) + TEMP = MAX( TEMP, ABS1( X ) ) + Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) + IF( TEMP2.GT.ZERO ) THEN + IF( DBLE( X / TEMP2 )*DBLE( Y )+ + $ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y + END IF + SHIFT = SHIFT - CTEMP*ZLADIV( CTEMP, ( X+Y ) ) END IF ELSE * * Exceptional shift. Chosen for no particularly good reason. * - ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ - $ (BSCALE*T(ILAST-1,ILAST-1)) + IF( ( IITER / 20 )*20.EQ.IITER .AND. 
+ $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) + ELSE + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) + END IF SHIFT = ESHIFT END IF * diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index 80e6b3232..b4e2223f7 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -174,7 +174,20 @@ if(PYTHONINTERP_FOUND) endif() - +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 +"if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & \"$($args[0]).exe\" | Out-File $args[2]\n" +"If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" +"echo Error\n" +"exit 1\n" +"} else {\n" +"exit 0\n" +"}\n" +) +set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1") +else() # $1 exec, $2 input, $3 output_result FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "rm -f $3\n" @@ -187,51 +200,52 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "exit 0\n" "fi\n" ) - +set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh") +endif() add_test(NAME "REAL_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" ) add_test(NAME "COMPLEX_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" + COMMAND ${helper_prefix} 
"${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" ) add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" ) add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" ) add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" ) # ======== COMPLEX-COMPLEX16 LIN TESTS ======================== add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" ) # ======== SINGLE RFP LIN TESTS 
======================== add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" ) # # @@ -239,327 +253,327 @@ add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" # add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" ) add_test(NAME "SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" ) add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" ) add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" ) add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines" - COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" ) add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" ) add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" ) add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" ) add_test(NAME "SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" ) add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" ) add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" ) add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" ) add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" ) add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices" - COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" ) add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" ) add_test(NAME "SGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" ) add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" ) add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" ) add_test(NAME "SCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" ) add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" ) # ======== COMPLEX EIG TESTS =========================== add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" ) add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" ) add_test(NAME "CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" ) add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" ) add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" ) add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" ) add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " 
${CMAKE_CURRENT_BINARY_DIR}/cgg.out" ) add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" ) add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" ) add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" ) add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" ) add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" ) add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" ) add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" ) add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" ) add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " 
${CMAKE_CURRENT_BINARY_DIR}/cglm.out" ) add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" ) add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" ) add_test(NAME "CCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" ) add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" ) # ======== DOUBLE EIG TESTS =========================== add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" ) add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" ) add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" ) add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" ) add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" ) add_test(NAME 
"DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" ) add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" ) add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" ) add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" ) add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" ) add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" ) add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" ) add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" ) add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" ) add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" ) add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" ) add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" ) add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" ) add_test(NAME "DCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" ) add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" ) # ======== COMPLEX16 EIG TESTS =========================== add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" ) add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" ) add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " 
${CMAKE_CURRENT_BINARY_DIR}/zse2.out" ) add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" ) add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" ) add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" ) add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" ) add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" ) add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" ) add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" ) add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" ) add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" 
) add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" ) add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" ) add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" ) add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" ) add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" ) add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" ) add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" ) add_test(NAME "Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" ) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index e877b1422..10c25a446 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -25,7 +25,7 @@ set(AEIGTST set(SCIGTST slafts.f slahd2.f slasum.f slatb9.f sstech.f sstect.f ssvdch.f ssvdct.f ssxt1.f) -set(SEIGTST schkee.f +set(SEIGTST schkee.F sbdt01.f sbdt02.f sbdt03.f sbdt04.f sbdt05.f schkbb.f schkbd.f schkbk.f 
schkbl.f schkec.f schkgg.f schkgk.f schkgl.f schkhs.f schksb.f schkst.f schkst2stg.f schksb2stg.f @@ -42,7 +42,7 @@ set(SEIGTST schkee.f sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f sstt22.f ssyt21.f ssyt22.f) -set(CEIGTST cchkee.f +set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f cchkgg.f cchkgk.f cchkgl.f cchkhb.f cchkhs.f cchkst.f cchkst2stg.f cchkhb2stg.f @@ -62,7 +62,7 @@ set(CEIGTST cchkee.f set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f dsvdch.f dsvdct.f dsxt1.f) -set(DEIGTST dchkee.f +set(DEIGTST dchkee.F dbdt01.f dbdt02.f dbdt03.f dbdt04.f dbdt05.f dchkbb.f dchkbd.f dchkbk.f dchkbl.f dchkec.f dchkgg.f dchkgk.f dchkgl.f dchkhs.f dchksb.f dchkst.f dchkst2stg.f dchksb2stg.f @@ -79,7 +79,7 @@ set(DEIGTST dchkee.f dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f dstt22.f dsyt21.f dsyt22.f) -set(ZEIGTST zchkee.f +set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f zchkgg.f zchkgk.f zchkgl.f zchkhb.f zchkhs.f zchkst.f zchkst2stg.f zchkhb2stg.f diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index b3efebcd0..a292e4496 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -157,11 +157,11 @@ cleanobj: cleanexe: rm -f xeigtst* -schkee.o: schkee.f +schkee.o: schkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -dchkee.o: dchkee.f +dchkee.o: dchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -cchkee.o: cchkee.f +cchkee.o: cchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -zchkee.o: zchkee.f +zchkee.o: zchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< diff --git a/lapack-netlib/TESTING/EIG/cchkee.f b/lapack-netlib/TESTING/EIG/cchkee.F similarity index 97% rename from lapack-netlib/TESTING/EIG/cchkee.f rename to lapack-netlib/TESTING/EIG/cchkee.F index f2a5f8d41..0d3d7493c 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.f +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1034,6 +1034,10 @@ * 
===================================================================== PROGRAM CCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- @@ -1071,7 +1075,7 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1084,12 +1088,16 @@ INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) REAL ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), - $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) - COMPLEX A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), - $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), + $ RESULT( 500 ) + COMPLEX DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), $ X( 5*NMAX ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX, DIMENSION(:), ALLOCATABLE :: WORK + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C +* .. * .. External Functions .. LOGICAL LSAMEN REAL SECOND, SLAMCH @@ -1130,6 +1138,21 @@ DATA INTSTR / '0123456789' / DATA IOLDSD / 0, 0, 0, 1 / * .. +* .. Allocate memory dynamically .. 
+* + ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * A = 0.0 @@ -1846,8 +1869,16 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL CERRST( 'CST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL CERRST( 'CST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2305,8 +2336,16 @@ MAXTYP = 15 NTYPES = MIN( MAXTYP, NTYPES ) CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL CERRST( 'CHB', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL CERRST( 'CHB', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF * CALL CCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, @@ -2436,7 +2475,14 @@ 380 CONTINUE WRITE( NOUT, FMT = 9994 ) S2 = SECOND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 + WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (S, STAT = AllocateStatus) + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, 
STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/dchkee.f b/lapack-netlib/TESTING/EIG/dchkee.F similarity index 98% rename from lapack-netlib/TESTING/EIG/dchkee.f rename to lapack-netlib/TESTING/EIG/dchkee.F index dc6f3205a..ee22ce33d 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.f +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1038,7 +1038,11 @@ *> \ingroup double_eig * * ===================================================================== - PROGRAM DCHKEE + PROGRAM DCHKEE +* +#if defined(_OPENMP) + use omp_lib +#endif * * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -1077,7 +1081,7 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1089,10 +1093,13 @@ $ PVAL( MAXIN ) INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) - DOUBLE PRECISION A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), - $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), - $ WORK( LWORK ), X( 5*NMAX ) + DOUBLE PRECISION D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), + $ TAUB( NMAX ), X( 5*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: WORK + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C * .. * .. External Functions .. LOGICAL LSAMEN @@ -1132,7 +1139,18 @@ * .. * .. Data statements .. DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / + DATA IOLDSD / 0, 0, 0, 1 / +* .. +* .. 
Allocate memory dynamically .. +* + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. * @@ -1856,8 +1874,16 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL DERRST( 'DST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL DERRST( 'DST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2436,7 +2462,12 @@ 380 CONTINUE WRITE( NOUT, FMT = 9994 ) S2 = DSECND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 + WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/schkee.f b/lapack-netlib/TESTING/EIG/schkee.F similarity index 98% rename from lapack-netlib/TESTING/EIG/schkee.f rename to lapack-netlib/TESTING/EIG/schkee.F index 3757e0655..a063c18b5 100644 --- a/lapack-netlib/TESTING/EIG/schkee.f +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1040,6 +1040,10 @@ * ===================================================================== PROGRAM SCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package 
provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- @@ -1077,7 +1081,7 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1089,10 +1093,13 @@ $ PVAL( MAXIN ) INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) - REAL A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), - $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), - $ WORK( LWORK ), X( 5*NMAX ) + REAL D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), + $ TAUB( NMAX ), X( 5*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: WORK + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C * .. * .. External Functions .. LOGICAL LSAMEN @@ -1132,7 +1139,18 @@ * .. * .. Data statements .. DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / + DATA IOLDSD / 0, 0, 0, 1 / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. 
* @@ -1857,8 +1875,16 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL SERRST( 'SST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL SERRST( 'SST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2440,6 +2466,11 @@ WRITE( NOUT, FMT = 9994 ) S2 = SECOND( ) WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/zchkee.f b/lapack-netlib/TESTING/EIG/zchkee.F similarity index 97% rename from lapack-netlib/TESTING/EIG/zchkee.f rename to lapack-netlib/TESTING/EIG/zchkee.F index 6807ef7e4..29604956d 100644 --- a/lapack-netlib/TESTING/EIG/zchkee.f +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1034,6 +1034,10 @@ * ===================================================================== PROGRAM ZCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- @@ -1071,7 +1075,7 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. 
@@ -1084,12 +1088,16 @@ INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) DOUBLE PRECISION ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), - $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) - COMPLEX*16 A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), - $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), + $ RESULT( 500 ) + COMPLEX*16 DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), $ X( 5*NMAX ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX*16, DIMENSION(:), ALLOCATABLE :: WORK + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C +* .. * .. External Functions .. LOGICAL LSAMEN DOUBLE PRECISION DLAMCH, DSECND @@ -1130,6 +1138,21 @@ DATA INTSTR / '0123456789' / DATA IOLDSD / 0, 0, 0, 1 / * .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. 
* A = 0.0 @@ -1846,8 +1869,16 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL ZERRST( 'ZST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL ZERRST( 'ZST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2303,8 +2334,16 @@ MAXTYP = 15 NTYPES = MIN( MAXTYP, NTYPES ) CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL ZERRST( 'ZHB', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL ZERRST( 'ZHB', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF * CALL ZCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, @@ -2435,6 +2474,13 @@ WRITE( NOUT, FMT = 9994 ) S2 = DSECND( ) WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (S, STAT = AllocateStatus) + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/param.h b/param.h index 6c5e0f107..c41f75ec9 100644 --- a/param.h +++ b/param.h @@ -72,6 +72,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef PARAM_H #define PARAM_H +#define LONGCAST (BLASLONG) +#if defined(__BYTE_ORDER__) +#if __GNUC__ < 9 +#undef LONGCAST +#define LONGCAST +#endif +#endif + #define SBGEMM_DEFAULT_UNROLL_N 4 #define SBGEMM_DEFAULT_UNROLL_M 8 #define SBGEMM_DEFAULT_UNROLL_MN 32 @@ -85,7 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x01ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -157,7 +165,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -237,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -330,7 +338,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -422,7 +430,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -515,7 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -607,7 +615,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -726,7 +734,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 384 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -774,7 +782,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -821,7 +829,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x01ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -890,7 +898,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #ifdef HAVE_SSE #define SGEMM_DEFAULT_UNROLL_M 8 @@ -945,7 +953,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #ifdef CORE_YONAH #define SGEMM_DEFAULT_UNROLL_M 4 @@ -1011,7 +1019,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 32 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1068,7 +1076,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 256 #endif -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1128,7 +1136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 448 #define GEMM_DEFAULT_OFFSET_B 128 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1201,7 +1209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 128 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1272,7 +1280,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 128 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1344,7 +1352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 32 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1417,7 +1425,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1510,7 +1518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1636,7 +1644,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1877,7 +1885,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1939,7 +1947,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 128 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -1993,7 +2001,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 512 #define GEMM_DEFAULT_OFFSET_B 512 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2061,7 +2069,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 8192 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2088,7 +2096,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef PPCG4 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2119,7 +2127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 2688 #define GEMM_DEFAULT_OFFSET_B 3072 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL #if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2168,7 +2176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2204,7 +2212,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2239,7 +2247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER3) || defined(POWER4) || defined(POWER5) #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2312,7 +2320,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2344,7 +2352,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #if defined(__32BIT__) #warning using BINARY32==POWER6 #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2397,7 +2406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL + +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2433,24 +2445,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL + +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define DGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_N 4 +#else #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 8 +#endif #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 832 -#define DGEMM_DEFAULT_P 320 +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 384 #define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 1026 -#define DGEMM_DEFAULT_Q 960 +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 512 #define CGEMM_DEFAULT_Q 1026 #define ZGEMM_DEFAULT_Q 1026 @@ -2480,7 +2500,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2512,7 +2532,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2543,7 +2563,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2578,7 +2598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #ifdef HAVE_MSA #define SGEMM_DEFAULT_UNROLL_M 8 @@ -2634,7 +2654,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2675,7 +2695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL #ifdef HAVE_MSA #define SGEMM_DEFAULT_UNROLL_M 8 @@ -2724,7 +2744,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef RISCV64_GENERIC #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2805,7 +2825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2846,7 +2866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -3121,7 +3141,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -3162,7 +3182,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3203,7 +3223,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3244,7 +3264,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -3283,7 +3303,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3365,7 +3385,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define 
GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 360ff2151..d338242ff 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -22,6 +22,20 @@ target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) endforeach() # $1 exec, $2 input, $3 output_result +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 +"if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & $args[0]\n" +"If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" +"echo Error\n" +"exit 1\n" +"} else {\n" +"exit 0\n" +"}\n" +) +set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1") +else() FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "rm -f $3\n" "$1 < $2\n" @@ -33,6 +47,8 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "exit 0\n" "fi\n" ) +set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh") +endif() #set(float_types s d c z) if (BUILD_SINGLE) @@ -50,9 +66,9 @@ endif() foreach(float_type ${float_types}) string(TOUPPER ${float_type} float_type_upper) add_test(NAME "${float_type}blas1" - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat1") + COMMAND $<TARGET_FILE:${float_type}blat1>) add_test(NAME "${float_type}blas2" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) + COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat2> "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) add_test(NAME "${float_type}blas3" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) + COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3>
"${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) endforeach() diff --git a/test/Makefile b/test/Makefile index 5f653414a..54fa60533 100644 --- a/test/Makefile +++ b/test/Makefile @@ -270,6 +270,9 @@ ifeq ($(C_COMPILER), CLANG) CEXTRALIB = -lomp endif endif +ifeq ($(F_COMPILER), NAG) +CEXTRALIB = -lgomp +endif endif ifeq ($(BUILD_SINGLE),1)