From 5a9cce2bf6740110b93a534f876072f220d928d1 Mon Sep 17 00:00:00 2001 From: Fabrice Fontaine Date: Sun, 5 May 2019 18:37:28 +0200 Subject: [PATCH 01/44] Makefile.arm: remove -march flags The provided -march flags, especially for ARMv5 and ARMv6 may not necessarily match the needed ones: for ARMv5, it might be armv5, armv5te, armv5t, etc. If the wrong one is used, the incorrect toolchain sysroot can be used in a multilib toolchain. Therefore, let the user building OpenBLAS pass the appropriate -march flag. The other flags, such as -mfpu=vfp or -mfloat-abi=hard are kept, as they are actually required for the build to proceed (OpenBLAS uses VFP instructions, and assume an EABIhf ABI). [Peter: update for v0.2.20] Signed-off-by: Thomas Petazzoni Signed-off-by: Peter Korsgaard [Retrieved from: https://git.buildroot.net/buildroot/tree/package/openblas/0001-Makefile.arm-remove-march-flags.patch] Signed-off-by: Fabrice Fontaine --- Makefile.arm | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index eedd39b73..b5d80f8e6 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,7 +1,7 @@ ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a +CCOMMON_OPT += -mfpu=neon +FCOMMON_OPT += -mfpu=neon else CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a @@ -9,11 +9,6 @@ endif endif ifeq ($(CORE), ARMV6) -CCOMMON_OPT += -mfpu=vfp -march=armv6 -FCOMMON_OPT += -mfpu=vfp -march=armv6 -endif - -ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -march=armv5 -FCOMMON_OPT += -march=armv5 +CCOMMON_OPT += -mfpu=vfp +FCOMMON_OPT += -mfpu=vfp endif From 9086543f503f63d9107ce539650f28918b027015 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 14:29:47 +0200 Subject: [PATCH 02/44] Utest needs CBLAS but not necessarily FORTRAN --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 50da721cd..d7d9c2fce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -211,7 +211,8 @@ if (USE_THREAD) target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) endif() -if (MSVC OR NOT NOFORTRAN) +#if (MSVC OR NOT NOFORTRAN) +if (NOT NO_CBLAS) # Broken without fortran on unix add_subdirectory(utest) endif() From ae9e8b131e27f65684cf4cb98e03b7df4b290142 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 14:30:33 +0200 Subject: [PATCH 03/44] Add mingw builds to Appveyor config --- appveyor.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 44a616aaa..2f9cc7b0b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -35,7 +35,14 @@ environment: DYNAMIC_ARCH: ON WITH_FORTRAN: no - COMPILER: cl - + - COMPILER: MinGW64-gcc-7.2.0-mingw + DYNAMIC_ARCH: OFF + WITH_FORTRAN: ignore + - COMPILER: MinGW64-gcc-7.2.0 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + COMPILER: MinGW-gcc-5.3.0 + WITH_FORTRAN: ignore + install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force @@ -52,7 +59,14 @@ install: before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% + - if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% + - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% + - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. + - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. + - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 .. + - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. @@ -64,3 +78,4 @@ test_script: - echo Running Test - cd utest - openblas_utest + From f69a0be712a9dccf5fcf433a734eb1371cb6189a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 15:02:39 +0200 Subject: [PATCH 04/44] Add getarch flags to disable AVX on x86 (and other small fixes to match Makefile behaviour) --- cmake/system.cmake | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7f3696286..1c2093efe 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,6 +70,13 @@ if (X86_64) set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") endif () +# On x86 no AVX support is available +if (X86 OR X86_64) +if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4")) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512") +endif () +endif () + if (INTERFACE64) message(STATUS "Using 64-bit integers.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") @@ -148,7 +155,9 @@ else() endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") - +if (DEFINED BINARY) + message(STATUS "Compiling a ${BINARY}-bit binary.") +endif () if (NOT DEFINED NEED_PIC) set(NEED_PIC 1) endif () @@ -165,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") if (NOT NOFORTRAN) # Fortran Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") +else () +set(NO_LAPACK 1) +set(NO_LAPACKE 1) endif () if (BINARY64) @@ -190,9 +202,14 @@ if (NEED_PIC) endif () if (DYNAMIC_ARCH) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") - if (DYNAMIC_OLDER) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") + if (X86 OR X86_64 OR ARM64 OR PPC) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") + if (DYNAMIC_OLDER) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") + endif () + else () + unset (DYNAMIC_ARCH) + message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing") endif () endif () From 04d671aae2b452a0bf63837c289f8948c35eb675 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 15:05:04 +0200 Subject: [PATCH 05/44] Make disabling DYNAMIC_ARCH on unsupported systems work needs to be unset in the cache for the change to have any effect --- cmake/arch.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index b4547b7c9..5a7434551 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -81,7 +81,8 @@ if (DYNAMIC_ARCH) endif () if (NOT DYNAMIC_CORE) - unset(DYNAMIC_ARCH) + message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options") + unset(DYNAMIC_ARCH CACHE) endif () endif () From 8fb76134bc0711634b410fa20d6eb113f8893a04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 15:07:15 +0200 Subject: [PATCH 06/44] Mingw32 needs leading underscore on object names (also copy BUNDERSCORE settings for FORTRAN from the corresponding Makefile) --- cmake/prebuild.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index a67c44bf5..e508a46c2 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -59,6 +59,9 @@ set(FU "") if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) set(FU "_") endif() +if(MINGW AND NOT MINGW64) + set(FU "_") +endif() set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) if (${COMPILER_ID} STREQUAL "GNU") @@ -82,6 +85,11 @@ endif () # f_check if (NOT NOFORTRAN) include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") +else () + file(APPEND ${TARGET_CONF_TEMP} + "#define BUNDERSCORE _\n" + "#define NEEDBUNDERSCORE 1\n") + set(BU "_") endif () # Cannot run getarch on target if we are cross-compiling From b89c781637503ec66117eb3b887a3755d42f0f46 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jul 2019 16:04:45 +0200 Subject: [PATCH 07/44] Fix surprising behaviour of NO_AFFINITY=0 --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 16791bcc2..09a648e4a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1124,8 +1124,12 @@ endif endif ifdef NO_AFFINITY +ifeq ($(NO_AFFINITY), 0) +override undefine NO_AFFINITY +else CCOMMON_OPT += -DNO_AFFINITY endif +endif ifdef FUNCTION_PROFILE CCOMMON_OPT += -DFUNCTION_PROFILE From b89d9762a29ac84422ebb6092584831efd85d355 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Mon, 8 Jul 2019 17:13:21 -0500 Subject: [PATCH 08/44] Change install_name on osx to match linux --- Makefile | 1 + Makefile.install | 3 ++- exports/Makefile | 8 ++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 07b08439e..60f189ef2 100644 --- a/Makefile +++ b/Makefile @@ -109,6 +109,7 @@ endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll diff --git a/Makefile.install b/Makefile.install index fefecd98d..8070b4729 100644 --- a/Makefile.install +++ b/Makefile.install @@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ + ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" diff --git a/exports/Makefile b/exports/Makefile index b1348bd4a..d32e449df 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol libgoto_hpl.def : gensymbol perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) +ifeq ($(OSNAME), Darwin) +INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib +endif + ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def else @@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran - $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c From 0ba29fd2625dfe405a08005a22d0fa21293cc16c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 00:46:51 +0800 Subject: [PATCH 09/44] Update dgemm_kernel_4x8_haswell.S for zen2 replaced a bunch of vpermpd instructions with vpermilpd and vperm2f128 --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 120 ++++++++++------------- 1 file changed, 54 insertions(+), 66 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index c84b599ce..5416018bb 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 @@ -153,7 +153,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -172,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -181,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -196,7 +196,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -206,7 +206,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -222,7 +222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -232,7 +232,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -247,7 +247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO @@ -257,7 +257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -284,18 +284,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -324,18 +322,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -365,18 +361,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) - vpermpd $ 0xb1 , %ymm13, %ymm13 - vpermpd $ 0xb1 , %ymm15, %ymm15 + vpermilpd $ 0x05 , %ymm13, %ymm13 + vpermilpd $ 0x05 , %ymm15, %ymm15 vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -687,7 +681,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -695,7 +689,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -710,14 +704,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -729,7 +723,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -737,7 +731,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -750,7 +744,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -758,7 +752,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO @@ -770,7 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO @@ -778,7 +772,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -799,18 +793,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -839,18 +831,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -1084,13 +1074,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1100,12 +1090,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 A_PR1(AO) vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1114,13 +1104,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M2 vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO @@ -1130,13 +1120,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_E vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm @@ -1145,13 +1135,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -1165,18 +1155,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 From 7a9050d6817dd63e4b3cb641566b03f069be47a9 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 00:55:06 +0800 Subject: [PATCH 10/44] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 5416018bb..b98610524 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -292,8 +292,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -330,8 +330,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -369,8 +369,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -801,8 +801,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -839,8 +839,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 From 182b06d6adb445d00066eff3b15da335ee1656bc Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 17:02:35 +0800 Subject: [PATCH 11/44] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 40 ++++++++++++------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index b98610524..814a1c350 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -317,10 +317,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -356,10 +356,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -395,10 +395,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm @@ -826,10 +826,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -865,10 +865,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm From 1733f927e6b892610bda045538a42d495faa1af5 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 21:27:41 +0800 Subject: [PATCH 12/44] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 814a1c350..b30ecccea 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -106,7 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define A_PR1 512 -#define B_PR1 512 +#define B_PR1 160 /******************************************************************************************* * Macro definitions From 211ab03b1402a3c39311b7ca769aaad736ca554c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 22:39:15 +0800 Subject: [PATCH 13/44] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index b30ecccea..3f7f9a98e 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -267,23 +267,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x12 + prefetcht0 128(%rsp) /*BUFFER 1*/ vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 - + prefetcht0 192(%rsp) vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - + prefetcht0 256(%rsp) vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - + prefetcht0 320(%rsp) vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 From 8a074b39656636ebec5812532b486cf751231a3b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 23:47:30 +0800 Subject: [PATCH 14/44] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 42 +++++++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 3f7f9a98e..5242e3efe 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -267,24 +267,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x12 - prefetcht0 128(%rsp) /*BUFFER 1*/ + prefetcht0 BUFFER1 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 - prefetcht0 192(%rsp) + prefetcht0 64 + BUFFER1 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - prefetcht0 256(%rsp) + prefetcht0 128 + BUFFER1 vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - prefetcht0 320(%rsp) + prefetcht0 192 + BUFFER1 vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -1606,6 +1606,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm +.macro PREFETCHT0_C + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + addq LDC,CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + leaq (CO1,LDC,2),CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + subq LDC,CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + subq LDC,CO1 + subq LDC,CO1 +.endm /*******************************************************************************************/ #if !defined(TRMMKERNEL) @@ -1773,7 +1804,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dec %rax jne .L12_12 - + + PREFETCHT0_C .L12_12a: KERNEL4x12_M1 From 9b04baeaeeaaaeba8c12e3fc2418ceaeca53ebb0 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 23:50:03 +0800 Subject: [PATCH 15/44] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 5242e3efe..42692f33b 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -318,10 +318,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 56(CO1) - prefetcht0 56(CO1,LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) + prefetcht1 56(CO1) + prefetcht1 56(CO1,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -357,10 +357,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -396,10 +396,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm From 9c89757562f43af48645a6563161909321077646 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 19 Jul 2019 23:47:58 +0800 Subject: [PATCH 16/44] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 29 +++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 42692f33b..e26bddea3 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1865,6 +1865,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + salq $3, K + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + prefetcht2 96(B) + prefetcht2 96(B, K, 8) + addq $128, B + sarq $3, K + decq I # i -- jne .L12_11 ALIGN_4 @@ -1872,6 +1880,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + movq M, I + sarq $2, I + salq $7, I + subq I, B + .L12_20: // Test rest of M @@ -2102,7 +2115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jmp .L13_16 - + PREFETCHT0_C .L13_13: test $1, %rax @@ -2147,6 +2160,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + salq $3, K + prefetcht2 (B) + prefetcht2 (B, K, 8) + prefetcht2 64(B) + prefetcht2 64(B, K, 8) + addq $128, B + sarq $3, K + decq I # i -- jne .L13_11 ALIGN_4 @@ -2154,6 +2175,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + + movq M, I + sarq $2, I + salq $7, I + subq I, B + .L13_20: // Test rest of M From 825777faab163326f38a0e6203ef1fb6fa8de6af Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 19 Jul 2019 23:58:24 +0800 Subject: [PATCH 17/44] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index e26bddea3..225af3673 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1865,12 +1865,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* currently an increment of 128 byte is suitable */ salq $3, K prefetcht2 32(B) prefetcht2 32(B, K, 8) prefetcht2 96(B) prefetcht2 96(B, K, 8) - addq $128, B + addq $128, B /* increment */ sarq $3, K decq I # i -- @@ -1880,6 +1883,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + /* recover the original value of pointer B */ movq M, I sarq $2, I salq $7, I @@ -2160,6 +2164,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* currently an increment of 128 byte is suitable */ salq $3, K prefetcht2 (B) prefetcht2 (B, K, 8) @@ -2175,7 +2182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ - + /* recover the original value of pointer B */ movq M, I sarq $2, I salq $7, I From f49f8047acbea636eb2a3542f306803a1285793b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 14:33:37 +0800 Subject: [PATCH 18/44] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 50 ++++++++++++++++++++---- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 225af3673..6d1460bb2 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -279,30 +279,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 +#if B_PR1 >= 96 prefetcht0 128 + BUFFER1 +#endif vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 +#if B_PR1 >= 160 prefetcht0 192 + BUFFER1 +#endif vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 - +#if B_PR1 >= 224 + prefetcht0 256 + BUFFER1 +#endif vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - +#if B_PR1 >= 288 + prefetcht0 320 + BUFFER1 +#endif vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - +#if B_PR1 >= 352 + prefetcht0 384 + BUFFER1 +#endif vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#if B_PR1 >= 416 + prefetcht0 448 + BUFFER1 +#endif leaq (CO1, LDC, 2), %rax +#if B_PR1 >= 480 + prefetcht0 512 + BUFFER1 +#endif #if !defined(TRMMKERNEL) @@ -1867,13 +1882,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* here for the prefetch of next b source block */ /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ - /* currently an increment of 128 byte is suitable */ + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 32(B) prefetcht2 32(B, K, 8) prefetcht2 96(B) prefetcht2 96(B, K, 8) addq $128, B /* increment */ +#endif sarq $3, K decq I # i -- @@ -1883,10 +1904,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ - /* recover the original value of pointer B */ + + /* recover the original value of pointer B after prefetch */ movq M, I sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ salq $7, I +#endif subq I, B .L12_20: @@ -2166,13 +2192,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* here for the prefetch of next b source block */ /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ - /* currently an increment of 128 byte is suitable */ + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + addq $64, B +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 (B) prefetcht2 (B, K, 8) prefetcht2 64(B) prefetcht2 64(B, K, 8) addq $128, B +#endif sarq $3, K decq I # i -- @@ -2185,7 +2217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* recover the original value of pointer B */ movq M, I sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ salq $7, I +#endif subq I, B .L13_20: From 94db259e5b432a7f1769c1d61071b9dd727778db Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 22:04:41 +0800 Subject: [PATCH 19/44] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 45 ++++++++++-------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 6d1460bb2..6a8619e32 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1622,35 +1622,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro PREFETCHT0_C + prefetcht0 ALPHA prefetcht0 (CO1) prefetcht0 24(CO1) prefetcht0 (CO1,LDC,4) prefetcht0 24(CO1,LDC,4) prefetcht0 (CO1,LDC,8) prefetcht0 24(CO1,LDC,8) - addq LDC,CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - leaq (CO1,LDC,2),CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - subq LDC,CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - subq LDC,CO1 - subq LDC,CO1 .endm /*******************************************************************************************/ @@ -1820,12 +1798,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dec %rax jne .L12_12 - PREFETCHT0_C .L12_12a: - + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -2133,9 +2118,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L13_12a: + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -2145,7 +2139,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jmp .L13_16 - PREFETCHT0_C .L13_13: test $1, %rax From 9440fa607d146f1b91d70e35404f0d4abe50ffc5 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 22:08:22 +0800 Subject: [PATCH 20/44] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 6a8619e32..c834239be 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1622,7 +1622,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro PREFETCHT0_C - prefetcht0 ALPHA prefetcht0 (CO1) prefetcht0 24(CO1) prefetcht0 (CO1,LDC,4) @@ -1799,6 +1798,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L12_12 .L12_12a: + prefetcht0 ALPHA PREFETCHT0_C addq LDC,CO1 KERNEL4x12_M1 @@ -2117,7 +2117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L13_12 .L13_12a: - + prefetcht0 ALPHA PREFETCHT0_C addq LDC,CO1 KERNEL4x12_M1 From 4801c6d36bd87421b08e60efa1b6e0217fd41672 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 21 Jul 2019 00:47:45 +0800 Subject: [PATCH 21/44] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index c834239be..26eea0acf 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1866,7 +1866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 /* here for the prefetch of next b source block */ - /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ salq $3, K #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ @@ -2184,19 +2184,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 /* here for the prefetch of next b source block */ - /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ salq $3, K #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ prefetcht2 (B) prefetcht2 (B, K, 8) - addq $64, B + addq $64, B /* increment */ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 (B) prefetcht2 (B, K, 8) prefetcht2 64(B) prefetcht2 64(B, K, 8) - addq $128, B + addq $128, B /* increment */ #endif sarq $3, K From 95fb98f556adcbbccc5f42318c7c645ec1837e1a Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 21 Jul 2019 01:10:32 +0800 Subject: [PATCH 22/44] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 26eea0acf..082e62a7c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -279,43 +279,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 -#if B_PR1 >= 96 +#if B_PR1 > 32 prefetcht0 128 + BUFFER1 #endif vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 -#if B_PR1 >= 160 +#if B_PR1 > 96 prefetcht0 192 + BUFFER1 #endif vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 -#if B_PR1 >= 224 +#if B_PR1 > 160 prefetcht0 256 + BUFFER1 #endif vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 -#if B_PR1 >= 288 +#if B_PR1 > 224 prefetcht0 320 + BUFFER1 #endif vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 -#if B_PR1 >= 352 +#if B_PR1 > 288 prefetcht0 384 + BUFFER1 #endif vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#if B_PR1 >= 416 +#if B_PR1 > 352 prefetcht0 448 + BUFFER1 #endif leaq (CO1, LDC, 2), %rax -#if B_PR1 >= 480 +#if B_PR1 > 416 prefetcht0 512 + BUFFER1 #endif From 28e96458e5a4b2d8039ed16048a07892a7c960bf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 22 Jul 2019 08:28:16 +0200 Subject: [PATCH 23/44] Replace vpermpd with vpermilpd to improve performance on Zen/Zen2 (as demonstrated by wjc404 in #2180) --- kernel/x86_64/zdot_microk_haswell-2.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 9f2fc2c1d..4eade7bfd 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" @@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" From 3f6ab1582aca019cf5514aac3af98dcb66c9bbd6 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Mon, 22 Jul 2019 21:24:57 -0600 Subject: [PATCH 24/44] MAINT: remove legacy CMake endif() * clean up a case where CMake endif() contained the conditional used in the if(), which is no longer needed / discouraged since our minimum required CMake version supports the modern syntax --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94d3ba643..610f689e0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX") EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) if(${OPERATING_SYSTEM} MATCHES "Android") set(HOST_OS ANDROID) - endif(${OPERATING_SYSTEM} MATCHES "Android") + endif() endif() From af2e7f28fce42e39fd3d4e108dfb4d55b377b5ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 23 Jul 2019 16:56:40 +0200 Subject: [PATCH 25/44] Override special make variables as seen in https://github.com/xianyi/OpenBLAS/issues/1912#issuecomment-514183900 , any external setting of TARGET_ARCH (which could result from building OpenBLAS as part of a larger project that actually uses this variable) would cause the utest build to fail. (Other subtargets appear to be unaffected as they do not use implicit make rules) --- utest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utest/Makefile b/utest/Makefile index cbe639cdb..5846db0bb 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,6 +1,9 @@ UTEST_CHECK = 1 TOPDIR = .. +override TARGET_ARCH= +override TARGET_MACH= + UTESTBIN=openblas_utest .PHONY : all From 30efed14d1aa9e1fba887aeddac964b841dd4720 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Jul 2019 15:26:09 +0200 Subject: [PATCH 26/44] Unset special make variables in ctest Makefile as well --- ctest/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index 569a5dda3..f562c9bb3 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -6,6 +6,8 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +override TARGET_ARCH= +override TARGET_MACH= LIB = $(TOPDIR)/$(LIBNAME) From 7eecd8e39cfd3bf3f8eddc1154b8b2bfec19ea33 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 28 Jul 2019 07:39:09 +0800 Subject: [PATCH 27/44] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 334 ++++++++++++++++++++++- 1 file changed, 325 insertions(+), 9 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 082e62a7c..19e32ef2c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -107,6 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define A_PR1 512 #define B_PR1 160 +#define BROADCASTKERNEL /******************************************************************************************* * Macro definitions @@ -133,7 +134,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 prefetcht0 B_PR1(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1+64(BO) vmovups -8 * SIZE(BO), %ymm2 prefetcht0 B_PR1+128(BO) @@ -143,17 +148,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -165,23 +182,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_M1 prefetcht0 A_PR1(AO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -192,21 +224,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x12_M2 +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -218,21 +266,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_E +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -241,23 +305,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_SUB vmovups -12 * SIZE(BO), %ymm1 +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -289,27 +369,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if B_PR1 > 96 prefetcht0 192 + BUFFER1 #endif + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 +#endif + #if B_PR1 > 160 prefetcht0 256 + BUFFER1 #endif + +#if defined BROADCASTKERNEL + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 +#endif + #if B_PR1 > 224 prefetcht0 320 + BUFFER1 #endif + +#ifndef BROADCASTKERNEL vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 +#endif + #if B_PR1 > 288 prefetcht0 384 + BUFFER1 #endif + +#ifndef BROADCASTKERNEL vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + #if B_PR1 > 352 prefetcht0 448 + BUFFER1 #endif @@ -338,11 +444,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht1 56(%rax) prefetcht1 56(%rax,LDC) - vpermilpd $ 0x05 , %ymm9 , %ymm9 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9, %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 + vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 @@ -353,7 +469,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -377,6 +493,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht1 56(%rbp) prefetcht1 56(%rbp,LDC) +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 + vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 + vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 + vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -392,7 +518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp @@ -693,19 +819,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_I vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -715,19 +857,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -736,18 +893,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -757,18 +930,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO @@ -776,19 +965,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -809,6 +1014,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -824,6 +1039,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax @@ -847,6 +1063,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 56(%rax) prefetcht0 56(%rax,LDC) +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -862,7 +1088,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -1088,15 +1314,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1104,29 +1346,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 .endm .macro KERNEL4x4_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO @@ -1134,30 +1407,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm .macro KERNEL4x4_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -1171,6 +1476,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -1186,6 +1501,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax From 2dfb804cb943ac12035fe51859d109daca76b4f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Jul 2019 23:17:28 +0200 Subject: [PATCH 28/44] Replace vpermpd with vpermilpd in the Haswell DTRMM kernel to improve performance on AMD Zen (#2180) applying wjc404's improvement of the DGEMM kernel from #2186 --- kernel/x86_64/dtrmm_kernel_4x8_haswell.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c index 651736b89..2acdc4615 100644 --- a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c +++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c @@ -33,7 +33,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t" @@ -41,7 +41,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t" @@ -62,18 +62,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t" " vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm5 , %%ymm5 \n\t" - " vpermpd $0xb1 , %%ymm7 , %%ymm7 \n\t" + " vpermilpd $0x05 , %%ymm5 , %%ymm5 \n\t" + " vpermilpd $0x05 , %%ymm7 , %%ymm7 \n\t" " vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t" " vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" @@ -85,18 +83,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmovups %%ymm6 , (%7) \n\t" " vmovups %%ymm7 , (%8) \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" From 648491e1aa5cec7e8b8947d8ce47a825ceba705d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Aug 2019 22:51:09 +0200 Subject: [PATCH 29/44] Autodetect Intel Ice Lake (as SKYLAKEX target) --- cpuid_x86.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 884d4b78a..141d6044e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1211,7 +1211,7 @@ int get_cpuname(void){ return CPUTYPE_CORE2; } break; - case 1: + case 1: // family 6 exmodel 1 switch (model) { case 6: return CPUTYPE_CORE2; @@ -1228,7 +1228,7 @@ int get_cpuname(void){ return CPUTYPE_DUNNINGTON; } break; - case 2: + case 2: // family 6 exmodel 2 switch (model) { case 5: //Intel Core (Clarkdale) / Core (Arrandale) @@ -1257,7 +1257,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 3: + case 3: // family 6 exmodel 3 switch (model) { case 7: // Bay Trail @@ -1287,7 +1287,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 4: + case 4: // family 6 exmodel 4 switch (model) { case 5: case 6: @@ -1321,7 +1321,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 5: + case 5: // family 6 exmodel 5 switch (model) { case 6: //Broadwell @@ -1364,7 +1364,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 6: + case 6: // family 6 exmodel 6 switch (model) { case 6: // Cannon Lake if(support_avx512()) @@ -1376,7 +1376,20 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; } - break; + break; + case 7: // family 6 exmodel 7 + switch (model) { + case 14: // Ice Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; case 9: case 8: switch (model) { From 3d36c4511693bfd7c117465a701c5ff1f19f8565 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Aug 2019 22:52:35 +0200 Subject: [PATCH 30/44] Add CPUID identification of Intel Ice Lake --- driver/others/dynamic.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 045fc65b8..f1cd3c6e6 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){ } } return NULL; + case 7: + if (model == 14) { + // Ice Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; case 9: case 8: - if (model == 14 ) { // Kaby Lake + if (model == 14 ) { // Kaby Lake, Coffee Lake if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { From acf6002ab242f98460845bb71db8fefdbdb26a1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Aug 2019 12:40:13 +0200 Subject: [PATCH 31/44] Replace most vpermpd calls in the Haswell DTRSM_RN kernel --- kernel/x86_64/dtrsm_kernel_RN_haswell.c | 36 +++++++++++-------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index 9ab78fc8e..cb939e762 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -132,7 +132,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "1: \n\t" " vmovups (%8,%1,4), %%ymm4 \n\t" // read a - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" // was vpermpd 0xb1 " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" @@ -143,7 +143,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" @@ -160,7 +160,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -170,7 +170,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " addq $8, %1 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" @@ -185,7 +185,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm13 \n\t" @@ -193,7 +193,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm11 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm15 \n\t" @@ -204,7 +204,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -212,42 +212,38 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" "3: \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm8 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm9 \n\t" " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm10 \n\t" " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" - " vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + " vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" + " vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" " vblendpd $0x0a , %%ymm13, %%ymm12, %%ymm0 \n\t" " vblendpd $0x05 , %%ymm13, %%ymm12, %%ymm1 \n\t" " vblendpd $0x0a , %%ymm15, %%ymm14, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm15, %%ymm14, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm12 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm13 \n\t" From 4e2f81cfa1f6dfa24912c3ff88470471b39b695e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Aug 2019 23:15:35 +0200 Subject: [PATCH 32/44] Provide more information on mmap/munmap failure for #2207 --- driver/others/memory.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index f67cb01f4..77d2b72fa 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2041,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ +if (!release->address) return 0; + if (munmap(release -> address, BUFFER_SIZE)) { - printf("OpenBLAS : munmap failed\n"); + int errsv=errno; + perror("OpenBLAS : munmap failed:"); + printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); } } @@ -2073,6 +2077,12 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif + } else { +#ifdef DEBUG + int errsv=errno; + perror("OpenBLAS : mmap failed:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); +#endif } #ifdef OS_LINUX From 1776ad82c01c0f9efeeda043eb02e10187084066 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Aug 2019 00:08:11 +0200 Subject: [PATCH 33/44] Add files via upload --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 77d2b72fa..534d6d9fc 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2041,7 +2041,7 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ -if (!release->address) return 0; +if (!release->address) return; if (munmap(release -> address, BUFFER_SIZE)) { int errsv=errno; From b7bbb02447ed612e380dc1ca6d6e7a26f48dc868 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 12:46:05 +0200 Subject: [PATCH 34/44] Silence two nuisance warnings from gcc --- cpuid_arm64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a5e731d74..e8aa29813 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -94,7 +94,7 @@ int get_feature(char *search) if( p == NULL ) return 0; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { if (!strcmp(t, search)) { return(1); } } @@ -344,7 +344,7 @@ void get_features(void) if( p == NULL ) return; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { } From be147a9f28889d831019c6f860d501b2546e3771 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 16:24:39 +0200 Subject: [PATCH 35/44] Avoid adding a spurious dependency on the fortran runtime despite NOFORTRAN=1 for cases where a fortran compiler is present but not wanted (e.g. not fully functional) --- Makefile.system | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 16791bcc2..835c76e78 100644 --- a/Makefile.system +++ b/Makefile.system @@ -267,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy OBJCONV = $(CROSS_SUFFIX)objconv -# For detect fortran failed, only build BLAS. +# When fortran support was either not detected or actively deselected, only build BLAS. ifeq ($(NOFORTRAN), 1) NO_LAPACK = 1 +override FEXTRALIB = endif # From 7b6808b69ca706c724a4258e7cfb460b3c8c25a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:28:13 +0200 Subject: [PATCH 36/44] Increment version to 0.3.8.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7d9c2fce..74db77135 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 7.dev) +set(OpenBLAS_PATCH_VERSION 8.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 02d92039811af88acd7e2b3d9fe4726c9f1008f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:28:47 +0200 Subject: [PATCH 37/44] Increment version to 0.3.8.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index a299588e0..c0941e488 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.7.dev +VERSION = 0.3.8.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 303869f5724bb86d722bc32f254a976625ea2046 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:31:36 +0200 Subject: [PATCH 38/44] Update with changes from 0.3.7 --- Changelog.txt | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 8df35d5c3..f160a4e13 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,46 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.7 +11-Aug 2019 + +common: + * having the gmake special variables TARGET_ARCH or TARGET_MACH + defined no longer causes build failures in ctest or utest + * defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer + has the same effect as setting them to 1 + * a new test program was added to allow checking the library for + thread safety + * a new option USE_LOCKING was added to ensure thread safety when + OpenBLAS itself is built without multithreading but will be + called from multiple threads. + * a build failure on Linux with glibc versions earlier than 2.5 + was fixed + * a runtime error with CPU enumeration (and NO_AFFINITY not set) + on glibc 2.6 was fixed + * NO_AFFINITY was added to the CMAKE options (and defaults to being + active on Linux, as in the gmake builds) + +x86_64: + * the build-time logic for detection of AVX512 availability in + the processor and compiler was fixed + * gmake builds on OSX now set the internal name of the library to + libopenblas.0.dylib (consistent with CMAKE) + * the Haswell DGEMM kernel received a significant speedup through + improved prefetch and load instructions + * performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly + increased by avoiding vpermpd instructions + * the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled + to fix remaining errors in DGEMM, DSYMM and DTRMM + +## POWER: + * added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970 + * added optimized kernels for POWER9 SGEMM and STRMM + +## ARMV7: + * fixed the softfp implementations of xAMAX and IxAMAX + * removed the predefined -march= flags on both ARMV5 and ARMV6 as + they were appropriate for only a subset of platforms + ==================================================================== Version 0.3.6 29-Apr-2019 From aef9804089b0c968806a0fc3cfb0219359ce42b2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Aug 2019 10:19:10 +0200 Subject: [PATCH 39/44] Fix unwanted case-sensitivity in x86 LSAME for (AMD) processors without CMOV Problem was already noticed some years ago in #238, but back then the problem was only corrected in one of the #ifdef branches. Fixes #2214 --- kernel/x86/lsame.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86/lsame.S b/kernel/x86/lsame.S index 3ac7a7314..2a2ab2bb5 100644 --- a/kernel/x86/lsame.S +++ b/kernel/x86/lsame.S @@ -56,13 +56,13 @@ #ifndef HAVE_CMOV movl %eax, %ecx subl $32, %ecx - jle .L1 + jl .L1 movl %ecx, %eax .L1: movl %edx, %ecx subl $32, %ecx - jle .L2 + jl .L2 movl %ecx, %edx .L2: subl %eax, %edx From a1fce677435a79d3cb577086793556d87ff76552 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Aug 2019 22:29:48 +0200 Subject: [PATCH 40/44] Make the new DGEMM regression test properly depend on CBLAS and LAPACKE fixes #2215 --- utest/CMakeLists.txt | 5 +++++ utest/Makefile | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 4e647cadc..1e3051a8f 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -38,9 +38,14 @@ if (NOT NO_LAPACK) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c + ) +if (NOT NO_CBLAS AND NOT NO_LAPACKE) +set(OpenBLAS_utest_src + ${OpenBLAS_utest_src} test_kernel_regress.c ) endif() +endif() set(OpenBLAS_utest_bin openblas_utest) add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) diff --git a/utest/Makefile b/utest/Makefile index cbe639cdb..8c7e6b9f8 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,6 +1,9 @@ UTEST_CHECK = 1 TOPDIR = .. +override TARGET_ARCH= +override TARGET_MACH= + UTESTBIN=openblas_utest .PHONY : all @@ -13,8 +16,12 @@ OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o ifneq ($(NO_LAPACK), 1) OBJS += test_potrs.o +ifneq ($(NO_CBLAS), 1) +ifneq ($(NO_LAPACKE), 1) OBJS += test_kernel_regress.o endif +endif +endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch From 9ef96b32a6cc9a41908e832f2f713462bb94f40f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Aug 2019 22:09:12 +0200 Subject: [PATCH 41/44] Add multithreading support to the x86_64 zdot kernel (#2222) * Add multithreading support copied from the ThunderX2T99 kernel. For #2221 --- kernel/x86_64/zdot.c | 90 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index ef12569c8..48f855b0e 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -86,18 +86,26 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #endif -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + + + +static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,OPENBLAS_COMPLEX_FLOAT *result) { BLASLONG i; BLASLONG ix,iy; FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; - + if ( n <= 0 ) { -// CREAL(result) = 0.0 ; -// CIMAG(result) = 0.0 ; - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); - return(result); + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); + *result=res; + return; } @@ -150,18 +158,68 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA } #if !defined(CONJ) - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); -// CREAL(result) = dot[0] - dot[1]; -// CIMAG(result) = dot[2] + dot[3]; + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); -// CREAL(result) = dot[0] + dot[1]; -// CIMAG(result) = dot[2] - dot[3]; - + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); #endif - - return(result); - + *result=res; + return; } +#if defined(SMP) +static int zdot_thread_function(BLASLONG n, BLASLONG dummy0, +BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, +BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + zdot_compute(n, x, inc_x, y, inc_y, (void *)result); + return 0; +} +#endif + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + OPENBLAS_COMPLEX_FLOAT zdot; + CREAL(zdot) = 0.0; + CIMAG(zdot) = 0.0; + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + zdot_compute(n, x, inc_x, y, inc_y, &zdot); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + OPENBLAS_COMPLEX_FLOAT *ptr; + +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, result, 0, + ( void *)zdot_thread_function, nthreads); + + ptr = (OPENBLAS_COMPLEX_FLOAT *)result; + for (i = 0; i < nthreads; i++) { + CREAL(zdot) = CREAL(zdot) + CREAL(*ptr); + CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr); + ptr = (void *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + zdot_compute(n, x, inc_x, y, inc_y, &zdot); +#endif + + return zdot; +} From e3d846ab57eabadf5b933e8ca66d0b2c62e23e4c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Aug 2019 08:58:10 +0200 Subject: [PATCH 42/44] Do not use -march=native with the PGI compiler --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 1c2093efe..4f8011603 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -66,7 +66,7 @@ if (DEFINED TARGET) endif () # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. -if (X86_64) +if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI") set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") endif () From a95a5e52b8df842f0ec23c6d0ad9b299c1318ab4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Aug 2019 09:00:11 +0200 Subject: [PATCH 43/44] Fix PGI compiler detection for getarch --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 6addbdad5..a54282f6c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -142,9 +142,9 @@ endif endif -# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. +# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. ifeq ($(ARCH), x86_64) -ifneq ($(C_COMPILER), PGI) +ifeq ($(findstring pgcc,$(HOSTCC)),) GETARCH_FLAGS += -march=native endif endif From 6d8595351c8452f32dc015cda30cf1d4983d2447 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Aug 2019 14:19:21 +0200 Subject: [PATCH 44/44] Add Intel Goldmont Plus CPUID fixes #2227 --- cpuid_x86.c | 29 +- dynamic.c | 897 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 919 insertions(+), 7 deletions(-) create mode 100644 dynamic.c diff --git a/cpuid_x86.c b/cpuid_x86.c index 884d4b78a..8c954bf21 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1211,7 +1211,7 @@ int get_cpuname(void){ return CPUTYPE_CORE2; } break; - case 1: + case 1: // family 6 exmodel 1 switch (model) { case 6: return CPUTYPE_CORE2; @@ -1228,7 +1228,7 @@ int get_cpuname(void){ return CPUTYPE_DUNNINGTON; } break; - case 2: + case 2: // family 6 exmodel 2 switch (model) { case 5: //Intel Core (Clarkdale) / Core (Arrandale) @@ -1257,7 +1257,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 3: + case 3: // family 6 exmodel 3 switch (model) { case 7: // Bay Trail @@ -1287,7 +1287,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 4: + case 4: // family 6 exmodel 4 switch (model) { case 5: case 6: @@ -1321,7 +1321,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 5: + case 5: // family 6 exmodel 5 switch (model) { case 6: //Broadwell @@ -1364,7 +1364,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 6: + case 6: // family 6 exmodel 6 switch (model) { case 6: // Cannon Lake if(support_avx512()) @@ -1376,7 +1376,22 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; } - break; + break; + case 7: // family 6 exmodel 7 + switch (model) { + case 10: // Goldmont Plus + return CPUTYPE_NEHALEM; + case 14: // Ice Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; case 9: case 8: switch (model) { diff --git a/dynamic.c b/dynamic.c new file mode 100644 index 000000000..aa2b87621 --- /dev/null +++ b/dynamic.c @@ -0,0 +1,897 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifdef _MSC_VER +#define strncasecmp _strnicmp +#define strcasecmp _stricmp +#endif + +#ifdef ARCH_X86 +#define EXTERN extern +#else +#define EXTERN +#endif + +#ifdef DYNAMIC_LIST +extern gotoblas_t gotoblas_PRESCOTT; + +#ifdef DYN_ATHLON +extern gotoblas_t gotoblas_ATHLON; +#else +#define gotoblas_ATHLON gotoblas_PRESCOTT +#endif +#ifdef DYN_KATMAI +extern gotoblas_t gotoblas_KATMAI; +#else +#define gotoblas_KATMAI gotoblas_PRESCOTT +#endif +#ifdef DYN_BANIAS +extern gotoblas_t gotoblas_BANIAS; +#else +#define gotoblas_BANIAS gotoblas_PRESCOTT +#endif +#ifdef DYN_COPPERMINE +extern gotoblas_t gotoblas_COPPERMINE; +#else +#define gotoblas_COPPERMINE gotoblas_PRESCOTT +#endif +#ifdef DYN_NORTHWOOD +extern gotoblas_t gotoblas_NORTHWOOD; +#else +#define gotoblas_NORTHWOOD gotoblas_PRESCOTT +#endif +#ifdef DYN_CORE2 +extern gotoblas_t gotoblas_CORE2; +#else +#define gotoblas_CORE2 gotoblas_PRESCOTT +#endif +#ifdef DYN_NEHALEM +extern gotoblas_t gotoblas_NEHALEM; +#else +#define gotoblas_NEHALEM gotoblas_PRESCOTT +#endif +#ifdef DYN_BARCELONA +extern gotoblas_t gotoblas_BARCELONA; +#elif defined(DYN_NEHALEM) +#define gotoblas_BARCELONA gotoblas_NEHALEM +#else +#define gotoblas_BARCELONA gotoblas_PRESCOTT +#endif +#ifdef DYN_ATOM +extern gotoblas_t gotoblas_ATOM; +elif defined(DYN_NEHALEM) +#define gotoblas_ATOM gotoblas_NEHALEM +#else +#define gotoblas_ATOM gotoblas_PRESCOTT +#endif +#ifdef DYN_NANO +extern gotoblas_t gotoblas_NANO; +#else +#define gotoblas_NANO gotoblas_PRESCOTT +#endif +#ifdef DYN_PENRYN +extern gotoblas_t gotoblas_PENRYN; +#else +#define gotoblas_PENRYN gotoblas_PRESCOTT +#endif +#ifdef DYN_DUNNINGTON +extern gotoblas_t gotoblas_DUNNINGTON; +#else +#define gotoblas_DUNNINGTON gotoblas_PRESCOTT +#endif +#ifdef DYN_OPTERON +extern gotoblas_t gotoblas_OPTERON; +#else +#define gotoblas_OPTERON gotoblas_PRESCOTT +#endif +#ifdef DYN_OPTERON_SSE3 +extern gotoblas_t gotoblas_OPTERON_SSE3; +#else +#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT +#endif +#ifdef DYN_BOBCAT +extern gotoblas_t gotoblas_BOBCAT; +#elif defined(DYN_NEHALEM) +#define gotoblas_BOBCAT gotoblas_NEHALEM +#else +#define gotoblas_BOBCAT gotoblas_PRESCOTT +#endif +#ifdef DYN_SANDYBRIDGE +extern gotoblas_t gotoblas_SANDYBRIDGE; +#elif defined(DYN_NEHALEM) +#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#else +#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT +#endif +#ifdef DYN_BULLDOZER +extern gotoblas_t gotoblas_BULLDOZER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_BULLDOZER gotoblas_NEHALEM +#else +#define gotoblas_BULLDOZER gotoblas_PRESCOTT +#endif +#ifdef DYN_PILEDRIVER +extern gotoblas_t gotoblas_PILEDRIVER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_PILEDRIVER gotoblas_NEHALEM +#else +#define gotoblas_PILEDRIVER gotoblas_PRESCOTT +#endif +#ifdef DYN_STEAMROLLER +extern gotoblas_t gotoblas_STEAMROLLER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_STEAMROLLER gotoblas_NEHALEM +#else +#define gotoblas_STEAMROLLER gotoblas_PRESCOTT +#endif +#ifdef DYN_EXCAVATOR +extern gotoblas_t gotoblas_EXCAVATOR; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_EXCAVATOR gotoblas_NEHALEM +#else +#define gotoblas_EXCAVATOR gotoblas_PRESCOTT +#endif +#ifdef DYN_HASWELL +extern gotoblas_t gotoblas_HASWELL; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_HASWELL gotoblas_NEHALEM +#else +#define gotoblas_HASWELL gotoblas_PRESCOTT +#endif +#ifdef DYN_ZEN +extern gotoblas_t gotoblas_ZEN; +#elif defined(DYN_HASWELL) +#define gotoblas_ZEN gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_ZEN gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_ZEN gotoblas_NEHALEM +#else +#define gotoblas_ZEN gotoblas_PRESCOTT +#endif +#ifdef DYN_SKYLAKEX +extern gotoblas_t gotoblas_SKYLAKEX; +#elif defined(DYN_HASWELL) +#define gotoblas_SKYLAKEX gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_SKYLAKEX gotoblas_NEHALEM +#else +#define gotoblas_SKYLAKEX gotoblas_PRESCOTT +#endif + + +#else // not DYNAMIC_LIST +EXTERN gotoblas_t gotoblas_KATMAI; +EXTERN gotoblas_t gotoblas_COPPERMINE; +EXTERN gotoblas_t gotoblas_NORTHWOOD; +EXTERN gotoblas_t gotoblas_BANIAS; +EXTERN gotoblas_t gotoblas_ATHLON; + +extern gotoblas_t gotoblas_PRESCOTT; +extern gotoblas_t gotoblas_CORE2; +extern gotoblas_t gotoblas_NEHALEM; +extern gotoblas_t gotoblas_BARCELONA; +#ifdef DYNAMIC_OLDER +extern gotoblas_t gotoblas_ATOM; +extern gotoblas_t gotoblas_NANO; +extern gotoblas_t gotoblas_PENRYN; +extern gotoblas_t gotoblas_DUNNINGTON; +extern gotoblas_t gotoblas_OPTERON; +extern gotoblas_t gotoblas_OPTERON_SSE3; +extern gotoblas_t gotoblas_BOBCAT; +#else +#define gotoblas_ATOM gotoblas_NEHALEM +#define gotoblas_NANO gotoblas_NEHALEM +#define gotoblas_PENRYN gotoblas_CORE2 +#define gotoblas_DUNNINGTON gotoblas_CORE2 +#define gotoblas_OPTERON gotoblas_CORE2 +#define gotoblas_OPTERON_SSE3 gotoblas_CORE2 +#define gotoblas_BOBCAT gotoblas_CORE2 +#endif + +#ifndef NO_AVX +extern gotoblas_t gotoblas_SANDYBRIDGE; +extern gotoblas_t gotoblas_BULLDOZER; +extern gotoblas_t gotoblas_PILEDRIVER; +extern gotoblas_t gotoblas_STEAMROLLER; +extern gotoblas_t gotoblas_EXCAVATOR; +#ifdef NO_AVX2 +#define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE +#define gotoblas_ZEN gotoblas_SANDYBRIDGE +#else +extern gotoblas_t gotoblas_HASWELL; +extern gotoblas_t gotoblas_ZEN; +#ifndef NO_AVX512 +extern gotoblas_t gotoblas_SKYLAKEX; +#else +#define gotoblas_SKYLAKEX gotoblas_HASWELL +#endif +#endif +#else +//Use NEHALEM kernels for sandy bridge +#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#define gotoblas_HASWELL gotoblas_NEHALEM +#define gotoblas_SKYLAKEX gotoblas_NEHALEM +#define gotoblas_BULLDOZER gotoblas_BARCELONA +#define gotoblas_PILEDRIVER gotoblas_BARCELONA +#define gotoblas_STEAMROLLER gotoblas_BARCELONA +#define gotoblas_EXCAVATOR gotoblas_BARCELONA +#define gotoblas_ZEN gotoblas_BARCELONA +#endif + +#endif // DYNAMIC_LIST + +#define VENDOR_INTEL 1 +#define VENDOR_AMD 2 +#define VENDOR_CENTAUR 3 +#define VENDOR_HYGON 4 +#define VENDOR_UNKNOWN 99 + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#ifndef NO_AVX +static inline void xgetbv(int op, int * eax, int * edx){ + //Use binary code for xgetbv + __asm__ __volatile__ + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); +} +#endif + +int support_avx(){ +#ifndef NO_AVX + int eax, ebx, ecx, edx; + int ret=0; + + cpuid(1, &eax, &ebx, &ecx, &edx); + if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 6) == 6){ + ret=1; //OS support AVX + } + } + return ret; +#else + return 0; +#endif +} + +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx()) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx()) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 1){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + +extern void openblas_warning(int verbose, const char * msg); +#define FALLBACK_VERBOSE 1 +#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" +#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" +#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" + +static int get_vendor(void){ + int eax, ebx, ecx, edx; + + union + { + char vchar[16]; + int vint[4]; + } vendor; + + cpuid(0, &eax, &ebx, &ecx, &edx); + + *(&vendor.vint[0]) = ebx; + *(&vendor.vint[1]) = edx; + *(&vendor.vint[2]) = ecx; + + vendor.vchar[12] = '\0'; + + if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; + if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; + if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; + + if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; + + return VENDOR_UNKNOWN; +} + +static gotoblas_t *get_coretype(void){ + + int eax, ebx, ecx, edx; + int family, exfamily, model, vendor, exmodel; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + family = BITMASK(eax, 8, 0x0f); + exfamily = BITMASK(eax, 20, 0xff); + model = BITMASK(eax, 4, 0x0f); + exmodel = BITMASK(eax, 16, 0x0f); + + vendor = get_vendor(); + + if (vendor == VENDOR_INTEL){ + switch (family) { + case 0x6: + switch (exmodel) { + case 0: + if (model <= 0x7) return &gotoblas_KATMAI; + if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE; + if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS; + if (model == 14) return &gotoblas_BANIAS; + if (model == 15) return &gotoblas_CORE2; + return NULL; + + case 1: + if (model == 6) return &gotoblas_CORE2; + if (model == 7) return &gotoblas_PENRYN; + if (model == 13) return &gotoblas_DUNNINGTON; + if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM; + if (model == 12) return &gotoblas_ATOM; + return NULL; + + case 2: + //Intel Core (Clarkdale) / Core (Arrandale) + // Pentium (Clarkdale) / Pentium Mobile (Arrandale) + // Xeon (Clarkdale), 32nm + if (model == 5) return &gotoblas_NEHALEM; + + //Intel Xeon Processor 5600 (Westmere-EP) + //Xeon Processor E7 (Westmere-EX) + //Xeon E7540 + if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; + + //Intel Core i5-2000 /i7-2000 (Sandy Bridge) + //Intel Core i7-3000 / Xeon E5 + if (model == 10 || model == 13) { + if(support_avx()) + return &gotoblas_SANDYBRIDGE; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; + case 3: + //Intel Sandy Bridge 22nm (Ivy Bridge?) + if (model == 10 || model == 14) { + if(support_avx()) + return &gotoblas_SANDYBRIDGE; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Haswell + if (model == 12 || model == 15) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Broadwell + if (model == 13) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + if (model == 7) return &gotoblas_ATOM; //Bay Trail + return NULL; + case 4: + //Intel Haswell + if (model == 5 || model == 6) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Broadwell + if (model == 7 || model == 15) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Skylake + if (model == 14) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Braswell / Avoton + if (model == 12 || model == 13) { + return &gotoblas_NEHALEM; + } + return NULL; + case 5: + //Intel Broadwell + if (model == 6) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + if (model == 5) { + // Intel Skylake X + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + //Intel Skylake + if (model == 14) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Phi Knights Landing + if (model == 7) { + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Apollo Lake or Denverton + if (model == 12 || model == 15) { + return &gotoblas_NEHALEM; + } + return NULL; + case 6: + if (model == 6) { + // Cannon Lake + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; + case 7: + if (model == 10) // Goldmont plus + return &gotoblas_NEHALEM; + if (model == 14) { + // Ice Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; + case 9: + case 8: + if (model == 14 ) { // Kaby Lake, Coffee Lake + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; + } + case 0xf: + if (model <= 0x2) return &gotoblas_NORTHWOOD; + return &gotoblas_PRESCOTT; + } + } + + if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ + if (family <= 0xe) { + // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + if ( (eax & 0xffff) >= 0x01) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) + return NULL; + } + else + return NULL; + + return &gotoblas_ATHLON; + } + if (family == 0xf){ + if ((exfamily == 0) || (exfamily == 2)) { + if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; + else return &gotoblas_OPTERON; + } else if (exfamily == 5) { + return &gotoblas_BOBCAT; + } else if (exfamily == 6) { + if(model == 1){ + //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return &gotoblas_BULLDOZER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if(model == 2 || model == 3){ + //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300 + if(support_avx()) + return &gotoblas_PILEDRIVER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if(model == 5){ + if(support_avx()) + return &gotoblas_EXCAVATOR; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if(model == 0 || model == 8){ + if (exmodel == 1) { + //AMD Trinity + if(support_avx()) + return &gotoblas_PILEDRIVER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if (exmodel == 3) { + //AMD STEAMROLLER + if(support_avx()) + return &gotoblas_STEAMROLLER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if (exmodel == 6) { + if(support_avx()) + return &gotoblas_EXCAVATOR; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + + } + } + } else if (exfamily == 8) { + if (model == 1 || model == 8) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + } + } else if (exfamily == 9) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else { + return &gotoblas_BARCELONA; + } + } + } + + if (vendor == VENDOR_CENTAUR) { + switch (family) { + case 0x6: + return &gotoblas_NANO; + } + } + + return NULL; +} + +static char *corename[] = { + "Unknown", + "Katmai", + "Coppermine", + "Northwood", + "Prescott", + "Banias", + "Atom", + "Core2", + "Penryn", + "Dunnington", + "Nehalem", + "Athlon", + "Opteron", + "Opteron_SSE3", + "Barcelona", + "Nano", + "Sandybridge", + "Bobcat", + "Bulldozer", + "Piledriver", + "Haswell", + "Steamroller", + "Excavator", + "Zen", + "SkylakeX" +}; + +char *gotoblas_corename(void) { + + if (gotoblas == &gotoblas_KATMAI) return corename[ 1]; + if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2]; + if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; + if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; + if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; + if (gotoblas == &gotoblas_ATOM) return corename[ 6]; + if (gotoblas == &gotoblas_CORE2) return corename[ 7]; + if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; + if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; + if (gotoblas == &gotoblas_NEHALEM) return corename[10]; + if (gotoblas == &gotoblas_ATHLON) return corename[11]; + if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; + if (gotoblas == &gotoblas_OPTERON) return corename[13]; + if (gotoblas == &gotoblas_BARCELONA) return corename[14]; + if (gotoblas == &gotoblas_NANO) return corename[15]; + if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; + if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; + if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; + if (gotoblas == &gotoblas_HASWELL) return corename[20]; + if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; + if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; + if (gotoblas == &gotoblas_ZEN) return corename[23]; + if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; + return corename[0]; +} + + +static gotoblas_t *force_coretype(char *coretype){ + + int i ; + int found = -1; + char message[128]; + //char mname[20]; + + for ( i=1 ; i <= 24; i++) + { + if (!strncasecmp(coretype,corename[i],20)) + { + found = i; + break; + } + } + if (found < 0) + { + //strncpy(mname,coretype,20); + snprintf(message, 128, "Core not found: %s\n",coretype); + openblas_warning(1, message); + return(NULL); + } + + switch (found) + { + case 24: return (&gotoblas_SKYLAKEX); + case 23: return (&gotoblas_ZEN); + case 22: return (&gotoblas_EXCAVATOR); + case 21: return (&gotoblas_STEAMROLLER); + case 20: return (&gotoblas_HASWELL); + case 19: return (&gotoblas_PILEDRIVER); + case 18: return (&gotoblas_BULLDOZER); + case 17: return (&gotoblas_BOBCAT); + case 16: return (&gotoblas_SANDYBRIDGE); + case 15: return (&gotoblas_NANO); + case 14: return (&gotoblas_BARCELONA); + case 13: return (&gotoblas_OPTERON); + case 12: return (&gotoblas_OPTERON_SSE3); + case 11: return (&gotoblas_ATHLON); + case 10: return (&gotoblas_NEHALEM); + case 9: return (&gotoblas_DUNNINGTON); + case 8: return (&gotoblas_PENRYN); + case 7: return (&gotoblas_CORE2); + case 6: return (&gotoblas_ATOM); + case 5: return (&gotoblas_BANIAS); + case 4: return (&gotoblas_PRESCOTT); + case 3: return (&gotoblas_NORTHWOOD); + case 2: return (&gotoblas_COPPERMINE); + case 1: return (&gotoblas_KATMAI); + } + return(NULL); + +} + + + + +void gotoblas_dynamic_init(void) { + + char coremsg[128]; + char coren[22]; + char *p; + + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + +#ifdef ARCH_X86 + if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; +#else + if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; + /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ + if (sizeof(void*) == 8) { + if (gotoblas == &gotoblas_KATMAI || + gotoblas == &gotoblas_COPPERMINE || + gotoblas == &gotoblas_NORTHWOOD || + gotoblas == &gotoblas_BANIAS || + gotoblas == &gotoblas_ATHLON) + gotoblas = &gotoblas_PRESCOTT; + } +#endif + + if (gotoblas && gotoblas -> init) { + strncpy(coren,gotoblas_corename(),20); + sprintf(coremsg, "Core: %s\n",coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + + gotoblas = NULL; + +}