From 5a9cce2bf6740110b93a534f876072f220d928d1 Mon Sep 17 00:00:00 2001 From: Fabrice Fontaine Date: Sun, 5 May 2019 18:37:28 +0200 Subject: [PATCH 01/43] Makefile.arm: remove -march flags The provided -march flags, especially for ARMv5 and ARMv6 may not necessarily match the needed ones: for ARMv5, it might be armv5, armv5te, armv5t, etc. If the wrong one is used, the incorrect toolchain sysroot can be used in a multilib toolchain. Therefore, let the user building OpenBLAS pass the appropriate -march flag. The other flags, such as -mfpu=vfp or -mfloat-abi=hard are kept, as they are actually required for the build to proceed (OpenBLAS uses VFP instructions, and assume an EABIhf ABI). [Peter: update for v0.2.20] Signed-off-by: Thomas Petazzoni Signed-off-by: Peter Korsgaard [Retrieved from: https://git.buildroot.net/buildroot/tree/package/openblas/0001-Makefile.arm-remove-march-flags.patch] Signed-off-by: Fabrice Fontaine --- Makefile.arm | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index eedd39b73..b5d80f8e6 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,7 +1,7 @@ ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a +CCOMMON_OPT += -mfpu=neon +FCOMMON_OPT += -mfpu=neon else CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a @@ -9,11 +9,6 @@ endif endif ifeq ($(CORE), ARMV6) -CCOMMON_OPT += -mfpu=vfp -march=armv6 -FCOMMON_OPT += -mfpu=vfp -march=armv6 -endif - -ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -march=armv5 -FCOMMON_OPT += -march=armv5 +CCOMMON_OPT += -mfpu=vfp +FCOMMON_OPT += -mfpu=vfp endif From 9086543f503f63d9107ce539650f28918b027015 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 14:29:47 +0200 Subject: [PATCH 02/43] Utest needs CBLAS but not necessarily FORTRAN --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 50da721cd..d7d9c2fce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -211,7 +211,8 @@ if (USE_THREAD) target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) endif() -if (MSVC OR NOT NOFORTRAN) +#if (MSVC OR NOT NOFORTRAN) +if (NOT NO_CBLAS) # Broken without fortran on unix add_subdirectory(utest) endif() From ae9e8b131e27f65684cf4cb98e03b7df4b290142 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 14:30:33 +0200 Subject: [PATCH 03/43] Add mingw builds to Appveyor config --- appveyor.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 44a616aaa..2f9cc7b0b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -35,7 +35,14 @@ environment: DYNAMIC_ARCH: ON WITH_FORTRAN: no - COMPILER: cl - + - COMPILER: MinGW64-gcc-7.2.0-mingw + DYNAMIC_ARCH: OFF + WITH_FORTRAN: ignore + - COMPILER: MinGW64-gcc-7.2.0 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + COMPILER: MinGW-gcc-5.3.0 + WITH_FORTRAN: ignore + install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force @@ -52,7 +59,14 @@ install: before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% + - if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% + - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% + - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. + - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. 
+ - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 .. + - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. @@ -64,3 +78,4 @@ test_script: - echo Running Test - cd utest - openblas_utest + From f69a0be712a9dccf5fcf433a734eb1371cb6189a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 15:02:39 +0200 Subject: [PATCH 04/43] Add getarch flags to disable AVX on x86 (and other small fixes to match Makefile behaviour) --- cmake/system.cmake | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7f3696286..1c2093efe 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,6 +70,13 @@ if (X86_64) set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") endif () +# On x86 no AVX support is available +if (X86 OR X86_64) +if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("${CMAKE_SIZEOF_VOID_P}" EQUAL "4")) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512") +endif () +endif () + if (INTERFACE64) message(STATUS "Using 64-bit integers.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") @@ -148,7 +155,9 @@ else() endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") - +if (DEFINED BINARY) + message(STATUS "Compiling a ${BINARY}-bit binary.") +endif () if (NOT DEFINED NEED_PIC) set(NEED_PIC 1) endif () @@ -165,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") if (NOT NOFORTRAN) # Fortran Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") 
+else () +set(NO_LAPACK 1) +set(NO_LAPACKE 1) endif () if (BINARY64) @@ -190,9 +202,14 @@ if (NEED_PIC) endif () if (DYNAMIC_ARCH) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") - if (DYNAMIC_OLDER) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") + if (X86 OR X86_64 OR ARM64 OR PPC) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") + if (DYNAMIC_OLDER) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") + endif () + else () + unset (DYNAMIC_ARCH) + message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing") endif () endif () From 04d671aae2b452a0bf63837c289f8948c35eb675 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 15:05:04 +0200 Subject: [PATCH 05/43] Make disabling DYNAMIC_ARCH on unsupported systems work needs to be unset in the cache for the change to have any effect --- cmake/arch.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index b4547b7c9..5a7434551 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -81,7 +81,8 @@ if (DYNAMIC_ARCH) endif () if (NOT DYNAMIC_CORE) - unset(DYNAMIC_ARCH) + message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options") + unset(DYNAMIC_ARCH CACHE) endif () endif () From 8fb76134bc0711634b410fa20d6eb113f8893a04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 15:07:15 +0200 Subject: [PATCH 06/43] Mingw32 needs leading underscore on object names (also copy BUNDERSCORE settings for FORTRAN from the corresponding Makefile) --- cmake/prebuild.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index a67c44bf5..e508a46c2 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -59,6 +59,9 @@ set(FU "") if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) set(FU "_") endif() +if(MINGW AND NOT MINGW64) + set(FU "_") +endif() set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) if (${COMPILER_ID} 
STREQUAL "GNU") @@ -82,6 +85,11 @@ endif () # f_check if (NOT NOFORTRAN) include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") +else () + file(APPEND ${TARGET_CONF_TEMP} + "#define BUNDERSCORE _\n" + "#define NEEDBUNDERSCORE 1\n") + set(BU "_") endif () # Cannot run getarch on target if we are cross-compiling From b89c781637503ec66117eb3b887a3755d42f0f46 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jul 2019 16:04:45 +0200 Subject: [PATCH 07/43] Fix surprising behaviour of NO_AFFINITY=0 --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 16791bcc2..09a648e4a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1124,8 +1124,12 @@ endif endif ifdef NO_AFFINITY +ifeq ($(NO_AFFINITY), 0) +override undefine NO_AFFINITY +else CCOMMON_OPT += -DNO_AFFINITY endif +endif ifdef FUNCTION_PROFILE CCOMMON_OPT += -DFUNCTION_PROFILE From b89d9762a29ac84422ebb6092584831efd85d355 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Mon, 8 Jul 2019 17:13:21 -0500 Subject: [PATCH 08/43] Change install_name on osx to match linux --- Makefile | 1 + Makefile.install | 3 ++- exports/Makefile | 8 ++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 07b08439e..60f189ef2 100644 --- a/Makefile +++ b/Makefile @@ -109,6 +109,7 @@ endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll diff --git a/Makefile.install b/Makefile.install index fefecd98d..8070b4729 100644 --- a/Makefile.install +++ b/Makefile.install @@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + ln -fs 
$(LIBDYNNAME) $(LIBPREFIX).dylib ; \ + ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" diff --git a/exports/Makefile b/exports/Makefile index b1348bd4a..d32e449df 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol libgoto_hpl.def : gensymbol perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) +ifeq ($(OSNAME), Darwin) +INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib +endif + ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def else @@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran - $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c From 0ba29fd2625dfe405a08005a22d0fa21293cc16c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 00:46:51 +0800 Subject: [PATCH 09/43] Update dgemm_kernel_4x8_haswell.S for zen2 replaced a bunch of vpermpd instructions with 
vpermilpd and vperm2f128 --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 120 ++++++++++------------- 1 file changed, 54 insertions(+), 66 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index c84b599ce..5416018bb 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 @@ -153,7 +153,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -172,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -181,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -196,7 +196,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -206,7 +206,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -222,7 +222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -232,7 +232,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -247,7 +247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO @@ -257,7 +257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -284,18 +284,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -324,18 +322,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -365,18 +361,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) - vpermpd $ 0xb1 , %ymm13, %ymm13 - vpermpd $ 0xb1 , %ymm15, %ymm15 + vpermilpd $ 0x05 , %ymm13, %ymm13 + vpermilpd $ 0x05 , %ymm15, %ymm15 vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -687,7 +681,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -695,7 +689,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -710,14 +704,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -729,7 +723,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -737,7 +731,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -750,7 +744,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -758,7 +752,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO @@ -770,7 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO @@ -778,7 +772,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -799,18 +793,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -839,18 +831,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -1084,13 +1074,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1100,12 +1090,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht0 A_PR1(AO) vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1114,13 +1104,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M2 vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO @@ -1130,13 +1120,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_E vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm @@ -1145,13 +1135,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -1165,18 +1155,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 From 7a9050d6817dd63e4b3cb641566b03f069be47a9 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 00:55:06 +0800 Subject: [PATCH 10/43] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 5416018bb..b98610524 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -292,8 +292,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -330,8 +330,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -369,8 +369,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -801,8 +801,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -839,8 +839,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 From 182b06d6adb445d00066eff3b15da335ee1656bc Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 17:02:35 +0800 Subject: [PATCH 11/43] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 40 ++++++++++++------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index b98610524..814a1c350 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -317,10 +317,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -356,10 +356,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -395,10 +395,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm @@ -826,10 +826,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -865,10 +865,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm From 1733f927e6b892610bda045538a42d495faa1af5 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 21:27:41 +0800 Subject: [PATCH 12/43] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 814a1c350..b30ecccea 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -106,7 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #define A_PR1 512 -#define B_PR1 512 +#define B_PR1 160 /******************************************************************************************* * Macro definitions From 211ab03b1402a3c39311b7ca769aaad736ca554c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 22:39:15 +0800 Subject: [PATCH 13/43] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index b30ecccea..3f7f9a98e 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -267,23 +267,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x12 + prefetcht0 128(%rsp) /*BUFFER 1*/ vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 - + prefetcht0 192(%rsp) vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - + prefetcht0 256(%rsp) vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - + prefetcht0 320(%rsp) vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 From 8a074b39656636ebec5812532b486cf751231a3b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 23:47:30 +0800 Subject: [PATCH 14/43] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 42 +++++++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 3f7f9a98e..5242e3efe 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -267,24 +267,24 @@ USE OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x12 - prefetcht0 128(%rsp) /*BUFFER 1*/ + prefetcht0 BUFFER1 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 - prefetcht0 192(%rsp) + prefetcht0 64 + BUFFER1 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - prefetcht0 256(%rsp) + prefetcht0 128 + BUFFER1 vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - prefetcht0 320(%rsp) + prefetcht0 192 + BUFFER1 vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -1606,6 +1606,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm +.macro PREFETCHT0_C + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + addq LDC,CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + leaq (CO1,LDC,2),CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + subq LDC,CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + subq LDC,CO1 + subq LDC,CO1 +.endm /*******************************************************************************************/ #if !defined(TRMMKERNEL) @@ -1773,7 +1804,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
dec %rax jne .L12_12 - + + PREFETCHT0_C .L12_12a: KERNEL4x12_M1 From 9b04baeaeeaaaeba8c12e3fc2418ceaeca53ebb0 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 23:50:03 +0800 Subject: [PATCH 15/43] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 5242e3efe..42692f33b 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -318,10 +318,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 56(CO1) - prefetcht0 56(CO1,LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) + prefetcht1 56(CO1) + prefetcht1 56(CO1,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -357,10 +357,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -396,10 +396,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm From 9c89757562f43af48645a6563161909321077646 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 19 Jul 2019 23:47:58 +0800 Subject: [PATCH 16/43] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 29 +++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 42692f33b..e26bddea3 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1865,6 +1865,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + salq $3, K + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + prefetcht2 96(B) + prefetcht2 96(B, K, 8) + addq $128, B + sarq $3, K + decq I # i -- jne .L12_11 ALIGN_4 @@ -1872,6 +1880,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + movq M, I + sarq $2, I + salq $7, I + subq I, B + .L12_20: // Test rest of M @@ -2102,7 +2115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jmp .L13_16 - + PREFETCHT0_C .L13_13: test $1, %rax @@ -2147,6 +2160,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + salq $3, K + prefetcht2 (B) + prefetcht2 (B, K, 8) + prefetcht2 64(B) + prefetcht2 64(B, K, 8) + addq $128, B + sarq $3, K + decq I # i -- jne .L13_11 ALIGN_4 @@ -2154,6 +2175,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/************************************************************************** * Rest of M ***************************************************************************/ + + movq M, I + sarq $2, I + salq $7, I + subq I, B + .L13_20: // Test rest of M From 825777faab163326f38a0e6203ef1fb6fa8de6af Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 19 Jul 2019 23:58:24 +0800 Subject: [PATCH 17/43] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index e26bddea3..225af3673 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1865,12 +1865,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* currently an increment of 128 byte is suitable */ salq $3, K prefetcht2 32(B) prefetcht2 32(B, K, 8) prefetcht2 96(B) prefetcht2 96(B, K, 8) - addq $128, B + addq $128, B /* increment */ sarq $3, K decq I # i -- @@ -1880,6 +1883,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + /* recover the original value of pointer B */ movq M, I sarq $2, I salq $7, I @@ -2160,6 +2164,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* currently an increment of 128 byte is suitable */ salq $3, K prefetcht2 (B) prefetcht2 (B, K, 8) @@ -2175,7 +2182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ - + /* recover the original value of pointer B */ movq M, I sarq $2, I salq $7, I From f49f8047acbea636eb2a3542f306803a1285793b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 14:33:37 +0800 Subject: [PATCH 18/43] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 50 ++++++++++++++++++++---- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 225af3673..6d1460bb2 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -279,30 +279,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 +#if B_PR1 >= 96 prefetcht0 128 + BUFFER1 +#endif vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 +#if B_PR1 >= 160 prefetcht0 192 + BUFFER1 +#endif vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 - +#if B_PR1 >= 224 + prefetcht0 256 + BUFFER1 +#endif vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - +#if B_PR1 >= 288 + prefetcht0 320 + BUFFER1 +#endif vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - +#if B_PR1 >= 352 + prefetcht0 384 + BUFFER1 +#endif vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#if B_PR1 >= 416 + prefetcht0 448 + BUFFER1 +#endif leaq (CO1, LDC, 2), %rax +#if B_PR1 >= 480 + prefetcht0 512 + BUFFER1 +#endif #if !defined(TRMMKERNEL) @@ -1867,13 +1882,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* here for the prefetch of next b source block */ /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ - /* currently an increment of 128 byte is suitable */ + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 32(B) prefetcht2 32(B, K, 8) prefetcht2 96(B) prefetcht2 96(B, K, 8) addq $128, B /* increment */ +#endif sarq $3, K decq I # i -- @@ -1883,10 +1904,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/************************************************************************** * Rest of M ***************************************************************************/ - /* recover the original value of pointer B */ + + /* recover the original value of pointer B after prefetch */ movq M, I sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ salq $7, I +#endif subq I, B .L12_20: @@ -2166,13 +2192,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* here for the prefetch of next b source block */ /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ - /* currently an increment of 128 byte is suitable */ + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + addq $64, B +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 (B) prefetcht2 (B, K, 8) prefetcht2 64(B) prefetcht2 64(B, K, 8) addq $128, B +#endif sarq $3, K decq I # i -- @@ -2185,7 +2217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* recover the original value of pointer B */ movq M, I sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ salq $7, I +#endif subq I, B .L13_20: From 94db259e5b432a7f1769c1d61071b9dd727778db Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 22:04:41 +0800 Subject: [PATCH 19/43] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 45 ++++++++++-------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 6d1460bb2..6a8619e32 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1622,35 +1622,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro PREFETCHT0_C + prefetcht0 ALPHA prefetcht0 (CO1) prefetcht0 24(CO1) prefetcht0 (CO1,LDC,4) prefetcht0 24(CO1,LDC,4) prefetcht0 (CO1,LDC,8) prefetcht0 24(CO1,LDC,8) - addq LDC,CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - leaq (CO1,LDC,2),CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - subq LDC,CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - subq LDC,CO1 - subq LDC,CO1 .endm /*******************************************************************************************/ @@ -1820,12 +1798,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dec %rax jne .L12_12 - PREFETCHT0_C .L12_12a: - + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -2133,9 +2118,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L13_12a: + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -2145,7 +2139,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
jmp .L13_16 - PREFETCHT0_C .L13_13: test $1, %rax From 9440fa607d146f1b91d70e35404f0d4abe50ffc5 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 22:08:22 +0800 Subject: [PATCH 20/43] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 6a8619e32..c834239be 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1622,7 +1622,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro PREFETCHT0_C - prefetcht0 ALPHA prefetcht0 (CO1) prefetcht0 24(CO1) prefetcht0 (CO1,LDC,4) @@ -1799,6 +1798,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L12_12 .L12_12a: + prefetcht0 ALPHA PREFETCHT0_C addq LDC,CO1 KERNEL4x12_M1 @@ -2117,7 +2117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L13_12 .L13_12a: - + prefetcht0 ALPHA PREFETCHT0_C addq LDC,CO1 KERNEL4x12_M1 From 4801c6d36bd87421b08e60efa1b6e0217fd41672 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 21 Jul 2019 00:47:45 +0800 Subject: [PATCH 21/43] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index c834239be..26eea0acf 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1866,7 +1866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
SAVE4x12 /* here for the prefetch of next b source block */ - /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ salq $3, K #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ @@ -2184,19 +2184,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 /* here for the prefetch of next b source block */ - /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ salq $3, K #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ prefetcht2 (B) prefetcht2 (B, K, 8) - addq $64, B + addq $64, B /* increment */ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 (B) prefetcht2 (B, K, 8) prefetcht2 64(B) prefetcht2 64(B, K, 8) - addq $128, B + addq $128, B /* increment */ #endif sarq $3, K From 95fb98f556adcbbccc5f42318c7c645ec1837e1a Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 21 Jul 2019 01:10:32 +0800 Subject: [PATCH 22/43] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 26eea0acf..082e62a7c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -279,43 +279,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 -#if B_PR1 >= 96 +#if B_PR1 > 32 prefetcht0 128 + BUFFER1 #endif vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 -#if B_PR1 >= 160 +#if B_PR1 > 96 prefetcht0 192 + BUFFER1 #endif vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 -#if B_PR1 >= 224 +#if B_PR1 > 160 prefetcht0 256 + BUFFER1 #endif vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 -#if B_PR1 >= 288 +#if B_PR1 > 224 prefetcht0 320 + BUFFER1 #endif vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 -#if B_PR1 >= 352 +#if B_PR1 > 288 prefetcht0 384 + BUFFER1 #endif vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#if B_PR1 >= 416 +#if B_PR1 > 352 prefetcht0 448 + BUFFER1 #endif leaq (CO1, LDC, 2), %rax -#if B_PR1 >= 480 +#if B_PR1 > 416 prefetcht0 512 + BUFFER1 #endif From 28e96458e5a4b2d8039ed16048a07892a7c960bf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 22 Jul 2019 08:28:16 +0200 Subject: [PATCH 23/43] Replace vpermpd with vpermilpd to improve performance on Zen/Zen2 (as demonstrated by wjc404 in #2180) --- kernel/x86_64/zdot_microk_haswell-2.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 9f2fc2c1d..4eade7bfd 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 
\n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" @@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" From 3f6ab1582aca019cf5514aac3af98dcb66c9bbd6 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Mon, 22 Jul 2019 21:24:57 -0600 Subject: [PATCH 24/43] MAINT: remove legacy CMake endif() * clean up a case where CMake endif() contained the conditional used in the if(), which is no longer needed / discouraged since our 
minimum required CMake version supports the modern syntax --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94d3ba643..610f689e0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX") EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) if(${OPERATING_SYSTEM} MATCHES "Android") set(HOST_OS ANDROID) - endif(${OPERATING_SYSTEM} MATCHES "Android") + endif() endif() From af2e7f28fce42e39fd3d4e108dfb4d55b377b5ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 23 Jul 2019 16:56:40 +0200 Subject: [PATCH 25/43] Override special make variables as seen in https://github.com/xianyi/OpenBLAS/issues/1912#issuecomment-514183900 , any external setting of TARGET_ARCH (which could result from building OpenBLAS as part of a larger project that actually uses this variable) would cause the utest build to fail. (Other subtargets appear to be unaffected as they do not use implicit make rules) --- utest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utest/Makefile b/utest/Makefile index cbe639cdb..5846db0bb 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,6 +1,9 @@ UTEST_CHECK = 1 TOPDIR = .. +override TARGET_ARCH= +override TARGET_MACH= + UTESTBIN=openblas_utest .PHONY : all From 30efed14d1aa9e1fba887aeddac964b841dd4720 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Jul 2019 15:26:09 +0200 Subject: [PATCH 26/43] Unset special make variables in ctest Makefile as well --- ctest/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index 569a5dda3..f562c9bb3 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -6,6 +6,8 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +override TARGET_ARCH= +override TARGET_MACH= LIB = $(TOPDIR)/$(LIBNAME) From 7eecd8e39cfd3bf3f8eddc1154b8b2bfec19ea33 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 28 Jul 2019 07:39:09 +0800 Subject: [PATCH 27/43] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 334 ++++++++++++++++++++++- 1 file changed, 325 insertions(+), 9 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 082e62a7c..19e32ef2c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -107,6 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define A_PR1 512 #define B_PR1 160 +#define BROADCASTKERNEL /******************************************************************************************* * Macro definitions @@ -133,7 +134,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 prefetcht0 B_PR1(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1+64(BO) vmovups -8 * SIZE(BO), %ymm2 prefetcht0 B_PR1+128(BO) @@ -143,17 +148,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -165,23 +182,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_M1 prefetcht0 A_PR1(AO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -192,21 +224,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x12_M2 +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -218,21 +266,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_E +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -241,23 +305,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x12_SUB vmovups -12 * SIZE(BO), %ymm1 +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -289,27 +369,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if B_PR1 > 96 prefetcht0 192 + BUFFER1 #endif + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 +#endif + #if B_PR1 > 160 prefetcht0 256 + BUFFER1 #endif + +#if defined BROADCASTKERNEL + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 +#endif + #if B_PR1 > 224 prefetcht0 320 + BUFFER1 #endif + +#ifndef BROADCASTKERNEL vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 +#endif + #if B_PR1 > 288 prefetcht0 384 + BUFFER1 #endif + +#ifndef BROADCASTKERNEL vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + #if B_PR1 > 352 prefetcht0 448 + BUFFER1 #endif @@ -338,11 +444,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht1 56(%rax) prefetcht1 56(%rax,LDC) - vpermilpd $ 0x05 , %ymm9 , %ymm9 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9, %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 + vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 @@ -353,7 +469,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -377,6 +493,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht1 56(%rbp) prefetcht1 56(%rbp,LDC) +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 + vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 + vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 + vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -392,7 +518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp @@ -693,19 +819,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x8_I vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -715,19 +857,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -736,18 +893,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -757,18 +930,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO @@ -776,19 +965,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x8_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -809,6 +1014,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -824,6 +1039,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax @@ -847,6 +1063,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht0 56(%rax) prefetcht0 56(%rax,LDC) +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -862,7 +1088,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -1088,15 +1314,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1104,29 +1346,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 .endm .macro KERNEL4x4_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO @@ -1134,30 +1407,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm .macro KERNEL4x4_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -1171,6 +1476,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -1186,6 +1501,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax From 2dfb804cb943ac12035fe51859d109daca76b4f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Jul 2019 23:17:28 +0200 Subject: [PATCH 28/43] Replace vpermpd with vpermilpd in the Haswell DTRMM kernel to improve performance on AMD Zen (#2180) applying wjc404's improvement of the DGEMM kernel from #2186 --- kernel/x86_64/dtrmm_kernel_4x8_haswell.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c index 651736b89..2acdc4615 100644 --- a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c +++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c @@ -33,7 +33,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t" @@ -41,7 +41,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t" @@ -62,18 +62,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t" " vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm5 , %%ymm5 \n\t" - " vpermpd $0xb1 , %%ymm7 , %%ymm7 \n\t" + " vpermilpd $0x05 , %%ymm5 , %%ymm5 \n\t" + " vpermilpd $0x05 , %%ymm7 , %%ymm7 \n\t" " vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t" 
" vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t" " vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" @@ -85,18 +83,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmovups %%ymm6 , (%7) \n\t" " vmovups %%ymm7 , (%8) \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" From 648491e1aa5cec7e8b8947d8ce47a825ceba705d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Aug 2019 22:51:09 +0200 Subject: [PATCH 29/43] Autodetect Intel Ice Lake (as SKYLAKEX target) --- cpuid_x86.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 884d4b78a..141d6044e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1211,7 +1211,7 @@ int get_cpuname(void){ return CPUTYPE_CORE2; } break; - case 1: + case 1: // family 6 exmodel 1 switch (model) { case 6: return CPUTYPE_CORE2; @@ -1228,7 +1228,7 @@ int 
get_cpuname(void){ return CPUTYPE_DUNNINGTON; } break; - case 2: + case 2: // family 6 exmodel 2 switch (model) { case 5: //Intel Core (Clarkdale) / Core (Arrandale) @@ -1257,7 +1257,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 3: + case 3: // family 6 exmodel 3 switch (model) { case 7: // Bay Trail @@ -1287,7 +1287,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 4: + case 4: // family 6 exmodel 4 switch (model) { case 5: case 6: @@ -1321,7 +1321,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 5: + case 5: // family 6 exmodel 5 switch (model) { case 6: //Broadwell @@ -1364,7 +1364,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 6: + case 6: // family 6 exmodel 6 switch (model) { case 6: // Cannon Lake if(support_avx512()) @@ -1376,7 +1376,20 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; } - break; + break; + case 7: // family 6 exmodel 7 + switch (model) { + case 14: // Ice Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; case 9: case 8: switch (model) { From 3d36c4511693bfd7c117465a701c5ff1f19f8565 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Aug 2019 22:52:35 +0200 Subject: [PATCH 30/43] Add CPUID identification of Intel Ice Lake --- driver/others/dynamic.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 045fc65b8..f1cd3c6e6 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){ } } return NULL; + case 7: + if (model == 14) { + // Ice Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + 
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; case 9: case 8: - if (model == 14 ) { // Kaby Lake + if (model == 14 ) { // Kaby Lake, Coffee Lake if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { From acf6002ab242f98460845bb71db8fefdbdb26a1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Aug 2019 12:40:13 +0200 Subject: [PATCH 31/43] Replace most vpermpd calls in the Haswell DTRSM_RN kernel --- kernel/x86_64/dtrsm_kernel_RN_haswell.c | 36 +++++++++++-------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index 9ab78fc8e..cb939e762 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -132,7 +132,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "1: \n\t" " vmovups (%8,%1,4), %%ymm4 \n\t" // read a - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" // was vpermpd 0xb1 " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" @@ -143,7 +143,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" @@ -160,7 +160,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 
, %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -170,7 +170,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " addq $8, %1 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" @@ -185,7 +185,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm13 \n\t" @@ -193,7 +193,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm11 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm15 \n\t" @@ -204,7 +204,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -212,42 +212,38 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" "3: \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd 
$0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm8 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm9 \n\t" " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm10 \n\t" " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" - " vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + " vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" + " vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" " vblendpd $0x0a , %%ymm13, %%ymm12, %%ymm0 \n\t" " vblendpd $0x05 , %%ymm13, %%ymm12, %%ymm1 \n\t" " vblendpd $0x0a , %%ymm15, %%ymm14, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm15, %%ymm14, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm12 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm13 \n\t" From 4e2f81cfa1f6dfa24912c3ff88470471b39b695e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Aug 2019 23:15:35 +0200 Subject: [PATCH 32/43] Provide more information on mmap/munmap failure for #2207 --- driver/others/memory.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index f67cb01f4..77d2b72fa 100644 --- a/driver/others/memory.c +++ 
b/driver/others/memory.c @@ -2041,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ +if (!release->address) return 0; + if (munmap(release -> address, BUFFER_SIZE)) { - printf("OpenBLAS : munmap failed\n"); + int errsv=errno; + perror("OpenBLAS : munmap failed:"); + printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); } } @@ -2073,6 +2077,12 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif + } else { +#ifdef DEBUG + int errsv=errno; + perror("OpenBLAS : mmap failed:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); +#endif } #ifdef OS_LINUX From 1776ad82c01c0f9efeeda043eb02e10187084066 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Aug 2019 00:08:11 +0200 Subject: [PATCH 33/43] Add files via upload --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 77d2b72fa..534d6d9fc 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2041,7 +2041,7 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ -if (!release->address) return 0; +if (!release->address) return; if (munmap(release -> address, BUFFER_SIZE)) { int errsv=errno; From b7bbb02447ed612e380dc1ca6d6e7a26f48dc868 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 12:46:05 +0200 Subject: [PATCH 34/43] Silence two nuisance warnings from gcc --- cpuid_arm64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a5e731d74..e8aa29813 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -94,7 +94,7 @@ int get_feature(char *search) if( p == NULL ) return 0; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { if (!strcmp(t, search)) { return(1); } } @@ -344,7 +344,7 @@ void 
get_features(void) if( p == NULL ) return; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { } From be147a9f28889d831019c6f860d501b2546e3771 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 16:24:39 +0200 Subject: [PATCH 35/43] Avoid adding a spurious dependency on the fortran runtime despite NOFORTRAN=1 for cases where a fortran compiler is present but not wanted (e.g. not fully functional) --- Makefile.system | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 16791bcc2..835c76e78 100644 --- a/Makefile.system +++ b/Makefile.system @@ -267,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy OBJCONV = $(CROSS_SUFFIX)objconv -# For detect fortran failed, only build BLAS. +# When fortran support was either not detected or actively deselected, only build BLAS. ifeq ($(NOFORTRAN), 1) NO_LAPACK = 1 +override FEXTRALIB = endif # From 7b6808b69ca706c724a4258e7cfb460b3c8c25a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:28:13 +0200 Subject: [PATCH 36/43] Increment version to 0.3.8.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7d9c2fce..74db77135 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 7.dev) +set(OpenBLAS_PATCH_VERSION 8.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 02d92039811af88acd7e2b3d9fe4726c9f1008f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:28:47 +0200 Subject: [PATCH 37/43] Increment version to 0.3.8.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index a299588e0..c0941e488 
100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.7.dev +VERSION = 0.3.8.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 303869f5724bb86d722bc32f254a976625ea2046 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:31:36 +0200 Subject: [PATCH 38/43] Update with changes from 0.3.7 --- Changelog.txt | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 8df35d5c3..f160a4e13 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,46 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.7 +11-Aug 2019 + +common: + * having the gmake special variables TARGET_ARCH or TARGET_MACH + defined no longer causes build failures in ctest or utest + * defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer + has the same effect as setting them to 1 + * a new test program was added to allow checking the library for + thread safety + * a new option USE_LOCKING was added to ensure thread safety when + OpenBLAS itself is built without multithreading but will be + called from multiple threads. 
+ * a build failure on Linux with glibc versions earlier than 2.5 + was fixed + * a runtime error with CPU enumeration (and NO_AFFINITY not set) + on glibc 2.6 was fixed + * NO_AFFINITY was added to the CMAKE options (and defaults to being + active on Linux, as in the gmake builds) + +x86_64: + * the build-time logic for detection of AVX512 availability in + the processor and compiler was fixed + * gmake builds on OSX now set the internal name of the library to + libopenblas.0.dylib (consistent with CMAKE) + * the Haswell DGEMM kernel received a significant speedup through + improved prefetch and load instructions + * performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly + increased by avoiding vpermpd instructions + * the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled + to fix remaining errors in DGEMM, DSYMM and DTRMM + +## POWER: + * added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970 + * added optimized kernels for POWER9 SGEMM and STRMM + +## ARMV7: + * fixed the softfp implementations of xAMAX and IxAMAX + * removed the predefined -march= flags on both ARMV5 and ARMV6 as + they were appropriate for only a subset of platforms + ==================================================================== Version 0.3.6 29-Apr-2019 From aef9804089b0c968806a0fc3cfb0219359ce42b2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Aug 2019 10:19:10 +0200 Subject: [PATCH 39/43] Fix unwanted case-sensitivity in x86 LSAME for (AMD) processors without CMOV Problem was already noticed some years ago in #238, but back then the problem was only corrected in one of the #ifdef branches. 
Fixes #2214 --- kernel/x86/lsame.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86/lsame.S b/kernel/x86/lsame.S index 3ac7a7314..2a2ab2bb5 100644 --- a/kernel/x86/lsame.S +++ b/kernel/x86/lsame.S @@ -56,13 +56,13 @@ #ifndef HAVE_CMOV movl %eax, %ecx subl $32, %ecx - jle .L1 + jl .L1 movl %ecx, %eax .L1: movl %edx, %ecx subl $32, %ecx - jle .L2 + jl .L2 movl %ecx, %edx .L2: subl %eax, %edx From a1fce677435a79d3cb577086793556d87ff76552 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Aug 2019 22:29:48 +0200 Subject: [PATCH 40/43] Make the new DGEMM regression test properly depend on CBLAS and LAPACKE fixes #2215 --- utest/CMakeLists.txt | 5 +++++ utest/Makefile | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 4e647cadc..1e3051a8f 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -38,9 +38,14 @@ if (NOT NO_LAPACK) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c + ) +if (NOT NO_CBLAS AND NOT NO_LAPACKE) +set(OpenBLAS_utest_src + ${OpenBLAS_utest_src} test_kernel_regress.c ) endif() +endif() set(OpenBLAS_utest_bin openblas_utest) add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) diff --git a/utest/Makefile b/utest/Makefile index cbe639cdb..8c7e6b9f8 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,6 +1,9 @@ UTEST_CHECK = 1 TOPDIR = .. 
+override TARGET_ARCH= +override TARGET_MACH= + UTESTBIN=openblas_utest .PHONY : all @@ -13,8 +16,12 @@ OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o ifneq ($(NO_LAPACK), 1) OBJS += test_potrs.o +ifneq ($(NO_CBLAS), 1) +ifneq ($(NO_LAPACKE), 1) OBJS += test_kernel_regress.o endif +endif +endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch From 9ef96b32a6cc9a41908e832f2f713462bb94f40f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Aug 2019 22:09:12 +0200 Subject: [PATCH 41/43] Add multithreading support to the x86_64 zdot kernel (#2222) * Add multithreading support copied from the ThunderX2T99 kernel. For #2221 --- kernel/x86_64/zdot.c | 90 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index ef12569c8..48f855b0e 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -86,18 +86,26 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #endif -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + + + +static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,OPENBLAS_COMPLEX_FLOAT *result) { BLASLONG i; BLASLONG ix,iy; FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; - + if ( n <= 0 ) { -// CREAL(result) = 0.0 ; -// CIMAG(result) = 0.0 ; - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); - return(result); + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); + *result=res; + return; } @@ -150,18 +158,68 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, 
FLOAT *y, BLA } #if !defined(CONJ) - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); -// CREAL(result) = dot[0] - dot[1]; -// CIMAG(result) = dot[2] + dot[3]; + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); -// CREAL(result) = dot[0] + dot[1]; -// CIMAG(result) = dot[2] - dot[3]; - + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); #endif - - return(result); - + *result=res; + return; } +#if defined(SMP) +static int zdot_thread_function(BLASLONG n, BLASLONG dummy0, +BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, +BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + zdot_compute(n, x, inc_x, y, inc_y, (void *)result); + return 0; +} +#endif + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + OPENBLAS_COMPLEX_FLOAT zdot; + CREAL(zdot) = 0.0; + CIMAG(zdot) = 0.0; + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + zdot_compute(n, x, inc_x, y, inc_y, &zdot); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + OPENBLAS_COMPLEX_FLOAT *ptr; + +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, result, 0, + ( void *)zdot_thread_function, nthreads); + + ptr = (OPENBLAS_COMPLEX_FLOAT *)result; + for (i = 0; i < nthreads; i++) { + CREAL(zdot) = CREAL(zdot) + CREAL(*ptr); + CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr); + ptr = (void *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + zdot_compute(n, x, inc_x, y, inc_y, &zdot); +#endif + + return zdot; +} From 
e3d846ab57eabadf5b933e8ca66d0b2c62e23e4c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Aug 2019 08:58:10 +0200 Subject: [PATCH 42/43] Do not use -march=native with the PGI compiler --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 1c2093efe..4f8011603 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -66,7 +66,7 @@ if (DEFINED TARGET) endif () # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. -if (X86_64) +if (X86_64 AND NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "PGI") set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") endif () From a95a5e52b8df842f0ec23c6d0ad9b299c1318ab4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Aug 2019 09:00:11 +0200 Subject: [PATCH 43/43] Fix PGI compiler detection for getarch --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 6addbdad5..a54282f6c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -142,9 +142,9 @@ endif endif -# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. +# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. ifeq ($(ARCH), x86_64) -ifneq ($(C_COMPILER), PGI) +ifeq ($(findstring pgcc,$(HOSTCC)),) GETARCH_FLAGS += -march=native endif endif