From f03dd23e90a4e248903b4c2f66c3507da716c38d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2020 23:18:07 +0100 Subject: [PATCH 01/75] Increment version to 0.3.9.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e5a1b551..951271717 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 8) +set(OpenBLAS_PATCH_VERSION 9.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 3bec250cf995c8f96e0849e582227589234effcd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2020 23:18:44 +0100 Subject: [PATCH 02/75] Increment version to 0.3.9.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 618034bb8..21b7e138a 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.8 +VERSION = 0.3.9.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 579be3aa9d0e196fc9cc91f6e1f2372e87638f78 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2020 23:28:04 +0100 Subject: [PATCH 03/75] Add configuration option for BUFFER_SIZE --- Makefile.rule | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile.rule b/Makefile.rule index 21b7e138a..724a60ec4 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -97,6 +97,15 @@ VERSION = 0.3.9.dev # they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 +# When multithreading, OpenBLAS needs to use a memory buffer for communicating +# and collating results for individual subranges of the original matrix. Since +# the original GotoBLAS of the early 2000s, the default size of this buffer has +# been set at a value of 32<<20 (which is 32MB) on x86_64 , twice that on PPC. 
+# If you expect to handle large problem sizes (beyond about 30000x30000) uncomment +# this line and adjust the (32< Date: Sun, 9 Feb 2020 23:30:22 +0100 Subject: [PATCH 04/75] Make BUFFER_SIZE configurable --- common_x86_64.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common_x86_64.h b/common_x86_64.h index c05998d58..fe5539abe 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -225,7 +225,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #endif #define HUGE_PAGESIZE ( 2 << 20) +#ifndef BUFFERSIZE #define BUFFER_SIZE (32 << 20) +#else +#define BUFFER_SIZE (32 << BUFFERSIZE) +#endif #define SEEK_ADDRESS From 7f0d523b42feb70e7b8ad299d8005d73f620f219 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2020 23:32:57 +0100 Subject: [PATCH 05/75] Make BUFFER_SIZE configurable --- cmake/system.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 4f8011603..ce980a7b9 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -289,6 +289,10 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}") +if (BUFFERSIZE) +set(CCOMMON_OPT "${CCOMMON_OPT} -DBUFFERSIZE=${BUFFERSIZE}") +endif () + if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () From 754433f4208832d14119b398e8dd46fbc448a828 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 10 Feb 2020 19:11:45 +0800 Subject: [PATCH 06/75] =?UTF-8?q?Avoid=20printing=20the=20following=20info?= =?UTF-8?q?rmation=20on=20mips=20and=20mips64=20when=20check=20msa:=20"unr?= =?UTF-8?q?ecognized=20command=20line=20option=20=E2=80=98-mmsa=E2=80=99"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index fbd1838aa..555b2eccf 100644 --- a/c_check +++ b/c_check @@ -195,7 +195,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { print $tmpf "void main(void){ __asm__ volatile($code); }\n"; $args = "$msa_flags -o $tmpf.o $tmpf"; - my @cmd = ("$compiler_name $args"); + my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? 
!= 0) { $have_msa = 0; From 303bdb673b8ef7b9e2ccdbb331827e00a1293951 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Feb 2020 19:17:32 +0100 Subject: [PATCH 07/75] Fix coretype detection for Intel extended models 6 and 7 affecting Goldmont, Cannon Lake, Ice Lake autodetection --- cpuid_x86.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 9e1c8e752..e29adecae 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -2006,6 +2006,38 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 6: + if (model == 6) +#ifndef NO_AVX512 + return CORE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif + break; + case 7: + if (model == 10) + return CORE_NEHALEM; + if (model == 14) +#ifndef NO_AVX512 + return CORE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif + break; case 9: case 8: if (model == 14) { // Kaby Lake From 7e5cbb6f3554342151c522ad4ab20111bf48f5d3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Feb 2020 21:17:39 +0100 Subject: [PATCH 08/75] Fix bad conditional syntax that caused spurious application of USE_TRMM --- kernel/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index ad15b8f25..b3310e87e 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -121,8 +121,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) - - if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex") + if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) + set(USE_TRMM true) + endif () + if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9)) set(USE_TRMM true) endif () From dff173e50e01d94e0741e4b4eaa1cf0aa01cf320 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 11 Feb 2020 14:46:30 +1300 Subject: [PATCH 09/75] Fix typo in dynamic_zarch.c --- driver/others/dynamic_zarch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 1206bf870..896e65bb4 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -31,7 +31,7 @@ char* gotoblas_corename(void) { } // __builtin_cpu_is is not supported by zarch -static gotolabs_t* get_coretype(void) { +static gotoblas_t* get_coretype(void) { FILE* infile; char buffer[512], * p; From 5a6bba3061f19923eb9972378021e6498bf8e5ed Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 11 Feb 2020 15:07:33 +1300 Subject: [PATCH 10/75] Patch out instances of Z15 in dynamic_zarch.c There does not appear to be a Z15 kernel yet, causing link errors from the code. This patch fixes the issue. 
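As a minimal illustration (hypothetical, simplified code, not the actual OpenBLAS sources) of the failure mode being patched out here: merely taking the address of an extern symbol that no object file defines leaves an unresolved reference for the linker, whether or not the branch using it is ever executed.

typedef struct { const char *core_name; } gotoblas_t;   /* stand-in for the real dispatch table type */

extern gotoblas_t gotoblas_Z14;   /* resolved, in a real build, by the Z14 kernel objects */
extern gotoblas_t gotoblas_Z15;   /* only declared; no kernel object defines it yet       */

const gotoblas_t *pick_core(int have_z15)
{
    /* Referencing &gotoblas_Z15 at all forces the linker to resolve the symbol,
     * so linking fails with "undefined reference to gotoblas_Z15" until every
     * use of it is removed or commented out, as done in the diff below.        */
    return have_z15 ? &gotoblas_Z15 : &gotoblas_Z14;
}
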
--- driver/others/dynamic_zarch.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 1206bf870..c7b82e4df 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -3,12 +3,12 @@ extern gotoblas_t gotoblas_Z13; extern gotoblas_t gotoblas_Z14; -extern gotoblas_t gotoblas_Z15; +//extern gotoblas_t gotoblas_Z15; //#if (!defined C_GCC) || (GCC_VERSION >= 60000) //extern gotoblas_t gotoblas_Z14; //#endif -#define NUM_CORETYPES 5 +#define NUM_CORETYPES 4 extern void openblas_warning(int verbose, const char* msg); @@ -16,14 +16,14 @@ static char* corename[] = { "unknown", "Z13", "Z14", - "Z15", +// "Z15", "ZARCH_GENERIC", }; char* gotoblas_corename(void) { if (gotoblas == &gotoblas_Z13) return corename[1]; if (gotoblas == &gotoblas_Z14) return corename[2]; - if (gotoblas == &gotoblas_Z15) return corename[3]; +// if (gotoblas == &gotoblas_Z15) return corename[3]; //#if (!defined C_GCC) || (GCC_VERSION >= 60000) // if (gotoblas == &gotoblas_POWER9) return corename[3]; //#endif @@ -78,7 +78,7 @@ static gotoblas_t* force_coretype(char* coretype) { { case 1: return (&gotoblas_Z13); case 2: return (&gotoblas_Z14); - case 3: return (&gotoblas_Z15); +// case 3: return (&gotoblas_Z15); //#if (!defined C_GCC) || (GCC_VERSION >= 60000) // case 3: return (&gotoblas_POWER9); //#endif From 7ea5e07d1cb59834428d982818b7cf565dcda4df Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Wed, 12 Feb 2020 14:11:44 +0000 Subject: [PATCH 11/75] Fix inline asm in dscal: mark x, x1 as clobbered. Fixes #2408 The leaq instructions in dscal_kernel_inc_8 modify x and x1 so they must be declared as input/output constraints, otherwise the compiler may assume the corresponding registers are not modified. 
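A minimal standalone sketch of the constraint rule being applied (hypothetical x86-64 GCC/Clang extended-asm example, not the OpenBLAS kernel itself): any operand whose register the asm body modifies must be listed as a read-write output ("+r") rather than a plain input ("r"), otherwise the compiler may keep using that register as if it still held the original value, which is the same class of bug reported in #2408.

#include <stdio.h>

/* Sums four elements and clears them. The asm advances p and decrements n,
 * so both belong in the output list as "+r"; declaring them as inputs "r"
 * would mirror the bug fixed in dscal_kernel_inc_8 below.                  */
static long sum_and_clear(long *p, long n)
{
    long total = 0;
    __asm__ __volatile__(
        "1:                  \n\t"
        "addq  (%1), %0      \n\t"   /* total += *p          */
        "movq  $0, (%1)      \n\t"   /* *p = 0               */
        "leaq  8(%1), %1     \n\t"   /* p++  (modifies %1)   */
        "decq  %2            \n\t"   /* n--  (modifies %2)   */
        "jnz   1b            \n\t"
        : "+r"(total), "+r"(p), "+r"(n)   /* read-write operands */
        :                                 /* no pure inputs      */
        : "cc", "memory");
    return total;
}

int main(void)
{
    long a[4] = {1, 2, 3, 4};
    printf("%ld\n", sum_and_clear(a, 4));   /* prints 10 */
    return 0;
}
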
--- kernel/x86_64/dscal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index d0d7801fd..e2436f789 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -136,10 +136,10 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ "jnz 1b \n\t" : - "+r" (n) // 0 + "+r" (n), // 0 + "+r" (x), // 1 + "+r" (x1) // 2 : - "r" (x), // 1 - "r" (x1), // 2 "r" (alpha), // 3 "r" (inc_x), // 4 "r" (inc_x3) // 5 From dc345d84df90a54e6f416ab7f3604b645297df23 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 19:56:52 +0100 Subject: [PATCH 12/75] Fix syntax of endianness conditional and add gcc version check for workaround --- kernel/power/KERNEL.POWER8 | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index fb9452a35..ba9a99cd1 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -89,30 +89,52 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(GCCVERSIONGTEQ9),1) ISAMAXKERNEL = isamax_power8.S else ISAMAXKERNEL = isamax.c endif +else +ISAMAXKERNEL = isamax.c +endif +# IDAMAXKERNEL = idamax.c -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +# +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(GCCVERSIONGTEQ9),1) ICAMAXKERNEL = icamax_power8.S else ICAMAXKERNEL = icamax.c endif +else +ICAMAXKERNEL = icamax.c +endif +# IZAMAXKERNEL = izamax.c # -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(GCCVERSIONGTEQ9),1) ISAMINKERNEL = isamin_power8.S else ISAMINKERNEL = isamin.c endif +else +ISAMINKERNEL = isamin.c +endif +# IDAMINKERNEL = idamin.c -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +# +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(GCCVERSIONGTEQ9),1) ICAMINKERNEL = icamin_power8.S else ICAMINKERNEL = icamin.c endif +else +ICAMINKERNEL = icamin.c +endif +# IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -128,11 +150,16 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power8.S else CAXPYKERNEL = caxpy.c endif +else +CAXPYKERNEL = caxpy.c +endif +# ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c From 120d20731face6457aa9f234eeaecf45f51a0b51 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 19:58:42 +0100 Subject: [PATCH 13/75] Fix syntax of endianness conditional --- kernel/power/KERNEL.PPC440 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index a0696b548..8d64d3fc1 100644 --- a/kernel/power/KERNEL.PPC440 +++ b/kernel/power/KERNEL.PPC440 @@ -15,7 +15,7 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c else @@ -25,7 +25,7 @@ endif SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") CDOTKERNEL = zdot_ppc440.S ZDOTKERNEL = 
zdot_ppc440.S else @@ -62,7 +62,7 @@ ZNRM2KERNEL = znrm2_ppc440.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") CROTKERNEL = zrot_ppc440.S ZROTKERNEL = zrot_ppc440.S else @@ -132,7 +132,7 @@ ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S -ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = ../arm/gemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c From 0544cbc806466a42a01b8235c87938ed93c221d6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 20:00:29 +0100 Subject: [PATCH 14/75] Fix syntax of endianness conditional --- kernel/power/KERNEL.PPC970 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index de30977de..cc1f215de 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -1,4 +1,4 @@ -ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") SGEMMKERNEL = gemm_kernel.S SGEMMINCOPY = SGEMMITCOPY = @@ -30,7 +30,7 @@ DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") CGEMMKERNEL = zgemm_kernel.S CGEMMINCOPY = CGEMMITCOPY = @@ -72,7 +72,7 @@ ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S -ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") STRSMKERNEL_LN = trsm_kernel_LN.S STRSMKERNEL_LT = trsm_kernel_LT.S STRSMKERNEL_RN = trsm_kernel_LT.S From 7c162b8a2104d316e05883f77c98bc795ccc7f81 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 23:56:57 +0100 Subject: [PATCH 15/75] Update isamax_power8.S --- kernel/power/isamax_power8.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/power/isamax_power8.S b/kernel/power/isamax_power8.S index fa5433333..7938779c9 100644 --- a/kernel/power/isamax_power8.S +++ b/kernel/power/isamax_power8.S @@ -11,7 +11,8 @@ #include "common.h" PROLOGUE - + +isamax_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From de40d47edf316a26f5607a242f8bda69c9f06caf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 23:57:48 +0100 Subject: [PATCH 16/75] Update isamin_power8.S --- kernel/power/isamin_power8.S | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/power/isamin_power8.S b/kernel/power/isamin_power8.S index c9b6acb85..432c0e776 100644 --- a/kernel/power/isamin_power8.S +++ b/kernel/power/isamin_power8.S @@ -11,6 +11,7 @@ PROLOGUE +isamin_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From 8eefa530cd141be1b38fe12ebc3831ba9c285c15 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 23:59:50 +0100 Subject: [PATCH 17/75] Update isamax_power8.S --- kernel/power/isamax_power8.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/power/isamax_power8.S b/kernel/power/isamax_power8.S index 7938779c9..ec1c283f3 100644 --- a/kernel/power/isamax_power8.S +++ b/kernel/power/isamax_power8.S @@ -12,7 +12,9 @@ PROLOGUE +#if _CALL_ELF == 2 isamax_k: +#endif .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From 5ba3699f419185898d368a4b7618d396dff419ba Mon Sep 17 00:00:00 2001 
From: Martin Kroeker Date: Thu, 13 Feb 2020 00:00:32 +0100 Subject: [PATCH 18/75] Update isamin_power8.S --- kernel/power/isamin_power8.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/power/isamin_power8.S b/kernel/power/isamin_power8.S index 432c0e776..1978af880 100644 --- a/kernel/power/isamin_power8.S +++ b/kernel/power/isamin_power8.S @@ -11,7 +11,9 @@ PROLOGUE +#if _CALL_ELF ==2 isamin_k: +#endif .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From 0e05ea9baca5d2c95a063a579367522d1ada811f Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 13 Feb 2020 14:51:55 +0100 Subject: [PATCH 19/75] Add CMake related files to .gitignore. --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e9d08ca7e..6803a919e 100644 --- a/.gitignore +++ b/.gitignore @@ -87,4 +87,5 @@ build.* *.swp benchmark/*.goto benchmark/smallscaling - +CMakeCache.txt +CMakeFiles/* From 486c35c5dc1f3fa3bae87e0256342068070c9ed6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Feb 2020 18:38:43 +0100 Subject: [PATCH 20/75] Update icamin_power8.S --- kernel/power/icamin_power8.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/power/icamin_power8.S b/kernel/power/icamin_power8.S index f2993e83e..e4469eb64 100644 --- a/kernel/power/icamin_power8.S +++ b/kernel/power/icamin_power8.S @@ -10,7 +10,9 @@ #include "common.h" PROLOGUE - +#if _CALL_ELF ==2 +icamin_k: +#endif .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From 92ca92a46c98f09121844199fa853ed0428ab521 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Feb 2020 21:24:54 +0100 Subject: [PATCH 21/75] Update caxpy_power8.S --- kernel/power/caxpy_power8.S | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S index 294a1d24d..dbbd55196 100644 --- a/kernel/power/caxpy_power8.S +++ b/kernel/power/caxpy_power8.S @@ -12,6 +12,13 @@ PROLOGUE +#if _CALL_ELF ==2 +#ifdef CONJ +caxpyc_k: +#else +caxpyc: +#endif +#endif .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From cafdd999b81a08b1a003d53e52e53ee04b1d1842 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Feb 2020 22:44:09 +0100 Subject: [PATCH 22/75] Update caxpy_power8.S --- kernel/power/caxpy_power8.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S index dbbd55196..10a372832 100644 --- a/kernel/power/caxpy_power8.S +++ b/kernel/power/caxpy_power8.S @@ -16,7 +16,7 @@ #ifdef CONJ caxpyc_k: #else -caxpyc: +caxpy_k: #endif #endif .LCF0: From eb285b4d20a46ce8db7f6234c9d4d804d81be41e Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Fri, 14 Feb 2020 10:45:31 +0100 Subject: [PATCH 23/75] Make ctest verbose for drone builder. --- .drone.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.drone.yml b/.drone.yml index 779912954..696c5a99d 100644 --- a/.drone.yml +++ b/.drone.yml @@ -92,7 +92,7 @@ steps: - mkdir build && cd build - cmake $CMAKE_FLAGS .. - make -j - - ctest + - ctest -V --- kind: pipeline @@ -116,7 +116,7 @@ steps: - mkdir build && cd build - cmake $CMAKE_FLAGS .. - make -j - - ctest + - ctest -V --- kind: pipeline @@ -140,4 +140,4 @@ steps: - mkdir build && cd build - cmake $CMAKE_FLAGS .. 
- make -j - - ctest + - ctest -V From c222b25b81672b0af5b4c550915d097be3d6dd88 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Feb 2020 19:29:14 +0100 Subject: [PATCH 24/75] Correct generation of GETRF files by the CMAKE build fixes #2396 --- lapack/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index d48a270ab..e21a9aabb 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -4,7 +4,6 @@ include_directories(${PROJECT_BINARY_DIR}) set(LAPACK_SOURCES - getrf/getrf_single.c potrf/potrf_U_single.c potrf/potrf_L_single.c lauum/lauum_U_single.c @@ -45,6 +44,10 @@ GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) GenerateNamedObjects("laswp/generic/laswp_k_4.c" "" "laswp_plus" false "" "" false 3) GenerateNamedObjects("laswp/generic/laswp_k_4.c" "MINUS" "laswp_minus" false "" "" false 3) +foreach (float_type ${FLOAT_TYPES}) +GenerateNamedObjects("getrf/getrf_single.c" "UNIT" "getrf_single" false "" "" false ${float_type}) +endforeach () + # dynamic_arch laswp needs arch specific code ? #foreach(TARGET_CORE ${DYNAMIC_CORE}) # set(TSUFFIX "_${TARGET_CORE}") @@ -81,7 +84,7 @@ if (USE_THREAD) ) foreach (float_type ${FLOAT_TYPES}) - GenerateNamedObjects("${GETRF_SRC}" "" "getrf_parallel" false "" "" false ${float_type}) + GenerateNamedObjects("${GETRF_SRC}" "UNIT" "getrf_parallel" false "" "" false ${float_type}) endforeach() GenerateNamedObjects("${PARALLEL_SOURCES}") From 46e4b12946ddbed0eb6bb42092fc0f8b21561a8d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Feb 2020 23:06:51 +0100 Subject: [PATCH 25/75] Update KERNEL.POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index fb9452a35..ab03e63ea 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -86,7 +86,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMAXKERNEL = ../arm/max.c #DMAXKERNEL = ../arm/max.c # -#SMINKERNEL = ../arm/min.c +SMINKERNEL = min_ppc440.S #DMINKERNEL = ../arm/min.c # ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) From d92bd5be246f9bbddb7a5406526f992e5c4ac72e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Feb 2020 23:07:50 +0100 Subject: [PATCH 26/75] Update KERNEL.POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index ab03e63ea..da765a26e 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -86,7 +86,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMAXKERNEL = ../arm/max.c #DMAXKERNEL = ../arm/max.c # -SMINKERNEL = min_ppc440.S +#SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) From e3368cbf1881cdbe65ccc20803f5596bab2b4c08 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 16 Feb 2020 22:58:00 +0800 Subject: [PATCH 27/75] AVX512 STRMM kernel --- kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 536 +++++++++++++++---- 1 file changed, 418 insertions(+), 118 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c index 6ca822b91..ec7570179 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c @@ -1,8 +1,152 @@ -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = 
"+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ -/* r10 to assist prefetch, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* r10 to assist prefetch, r11 = m_counter, r12 = k << 4(const), r13 = k_todo, r14 = b_head_pos(const), r15 = %1 + 3r12 */ #include "common.h" #include +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + #define BACKWARDS 1 +#else + #define BACKWARDS 0 +#endif +#define REC_POINTER_1(ptr) "salq $2,%%r13; subq %%r13,"#ptr"; sarq $2,%%r13;" +#define REC_POINTER_2(ptr) "salq $3,%%r13; subq %%r13,"#ptr"; sarq $3,%%r13;" +#define REC_POINTER_4(ptr) "salq $4,%%r13; subq %%r13,"#ptr"; sarq $4,%%r13;" +#define REC_POINTER_8(ptr) "salq $5,%%r13; subq %%r13,"#ptr"; sarq $5,%%r13;" +#define REC_POINTER_16(ptr) "salq $6,%%r13; subq %%r13,"#ptr"; sarq $6,%%r13;" +#define INC_POINTER_1(ptr) "sarq $2,%%r12; addq %%r12,"#ptr"; salq $2,%%r12;" +#define INC_POINTER_2(ptr) "sarq $1,%%r12; addq %%r12,"#ptr"; salq $1,%%r12;" +#define INC_POINTER_4(ptr) "addq %%r12,"#ptr";" +#define INC_POINTER_8(ptr) "leaq ("#ptr",%%r12,2),"#ptr";" +#define INC_POINTER_16(ptr) "leaq ("#ptr",%%r12,4),"#ptr";" +#define SET_POINTER(ptr,dim) REC_POINTER_##dim(ptr) INC_POINTER_##dim(ptr) +#define SET_PB_1 SET_POINTER(%1,1) +#define SET_PB_2 SET_POINTER(%1,2) +#define SET_PB_4 SET_POINTER(%1,4) +#define SET_PB_8 SET_POINTER(%1,4) +#define SET_PB_12 SET_POINTER(%1,4) +#define SET_PB_16 SET_POINTER(%1,4) +#define SET_PB_20 SET_POINTER(%1,4) +#define SET_PB_24 SET_POINTER(%1,4) +#ifdef TRMMKERNEL + #if BACKWARDS == 1 + #define START_SET_PAPB(mdim,ndim) SET_POINTER(%0,mdim) "movq %%r14,%1;" SET_PB_##ndim "leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;" + #define END_SET_PA(mdim) "" + #else + #define START_SET_PAPB(mdim,ndim) "movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;" + #define END_SET_PA(mdim) SET_POINTER(%0,mdim) + #endif +#else + #define START_SET_PAPB(mdim,ndim) "movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;" + #define END_SET_PA(mdim) "" +#endif +#define RECOVER_PA(mdim) REC_POINTER_##mdim(%0) + +#if defined(TRMMKERNEL) && !defined(LEFT) + #if BACKWARDS == 1 + #define KERNEL_HEAD_C_n8(mdim) \ + KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "subq $4,%4; addq $64,%%r15;" + #define KERNEL_HEAD_C_n12(mdim) KERNEL_HEAD_C_n8(mdim)\ + KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 "subq $4,%4; addq $64,%%r15;" + #define KERNEL_HEAD_C_n16(mdim) KERNEL_HEAD_C_n12(mdim)\ + KERNEL_k1m##mdim##n12 KERNEL_k1m##mdim##n12 KERNEL_k1m##mdim##n12 KERNEL_k1m##mdim##n12 "subq $4,%4; addq $64,%%r15;" + #define KERNEL_HEAD_C_n20(mdim) KERNEL_HEAD_C_n16(mdim)\ + KERNEL_k1m##mdim##n16 KERNEL_k1m##mdim##n16 KERNEL_k1m##mdim##n16 KERNEL_k1m##mdim##n16 "subq $4,%4;" + #define KERNEL_HEAD_C_n24(mdim) KERNEL_HEAD_C_n20(mdim)\ + KERNEL_k1m##mdim##n20 KERNEL_k1m##mdim##n20 KERNEL_k1m##mdim##n20 KERNEL_k1m##mdim##n20 "subq $4,%4;" + #define KERNEL_HEAD_R_n4(mdim) "subq $12,%4; addq $64,%%r15; addq $"#mdim"*48,%0;" + #define KERNEL_HEAD_R_n8(mdim) KERNEL_HEAD_R_n4(mdim)\ + kernel_k1m##mdim##n4(%%r15) kernel_k1m##mdim##n4(%%r15) kernel_k1m##mdim##n4(%%r15) kernel_k1m##mdim##n4(%%r15) "subq $4,%4;" + #define KERNEL_HEAD_R_n12(mdim) KERNEL_HEAD_R_n8(mdim)\ + kernel_k1m##mdim##n8(%%r15) kernel_k1m##mdim##n8(%%r15) kernel_k1m##mdim##n8(%%r15) 
kernel_k1m##mdim##n8(%%r15) "subq $4,%4;" + #define KERNEL_TAIL_C_n8(mdim) "" + #define KERNEL_TAIL_C_n12(mdim) "" + #define KERNEL_TAIL_C_n16(mdim) "" + #define KERNEL_TAIL_C_n20(mdim) "" + #define KERNEL_TAIL_C_n24(mdim) "" + #define KERNEL_TAIL_R_n4(mdim) "" + #define KERNEL_TAIL_R_n8(mdim) "" + #define KERNEL_TAIL_R_n12(mdim) "" + #else + #define KERNEL_HEAD_C_n8(mdim) "" + #define KERNEL_HEAD_C_n12(mdim) "" + #define KERNEL_HEAD_C_n16(mdim) "" + #define KERNEL_HEAD_C_n20(mdim) "" + #define KERNEL_HEAD_C_n24(mdim) "" + #define KERNEL_HEAD_R_n4(mdim) "" + #define KERNEL_HEAD_R_n8(mdim) "" + #define KERNEL_HEAD_R_n12(mdim) "" + #define end_kernel_k4_ncx1(k_0,k_1,k_2,k_3,n1,mdim) \ + end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0)\ + end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1)\ + end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2)\ + end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) + #define end_kernel_k4_ncx2(k_0,k_1,k_2,k_3,n1,n2,mdim) \ + end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0)\ + end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1)\ + end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) end_acc_nc##n2##_k1m##mdim(k_2)\ + end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3) + #define end_kernel_k4_ncx3(k_0,k_1,k_2,k_3,n1,n2,n3,mdim) \ + end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0) end_acc_nc##n3##_k1m##mdim(k_0)\ + end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1) end_acc_nc##n3##_k1m##mdim(k_1)\ + end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) end_acc_nc##n2##_k1m##mdim(k_2) end_acc_nc##n3##_k1m##mdim(k_2)\ + end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3) end_acc_nc##n3##_k1m##mdim(k_3) + #define end_kernel_k4_ncx4(k_0,k_1,k_2,k_3,n1,n2,n3,n4,mdim) \ + end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0) end_acc_nc##n3##_k1m##mdim(k_0) end_acc_nc##n4##_k1m##mdim(k_0)\ + end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1) end_acc_nc##n3##_k1m##mdim(k_1) end_acc_nc##n4##_k1m##mdim(k_1)\ + end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) end_acc_nc##n2##_k1m##mdim(k_2) end_acc_nc##n3##_k1m##mdim(k_2) end_acc_nc##n4##_k1m##mdim(k_2)\ + end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3) end_acc_nc##n3##_k1m##mdim(k_3) end_acc_nc##n4##_k1m##mdim(k_3) + #define end_kernel_k4_ncx5(k_0,k_1,k_2,k_3,n1,n2,n3,n4,n5,mdim) \ + end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0)\ + end_acc_nc##n3##_k1m##mdim(k_0) end_acc_nc##n4##_k1m##mdim(k_0) end_acc_nc##n5##_k1m##mdim(k_0)\ + end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1)\ + end_acc_nc##n3##_k1m##mdim(k_1) end_acc_nc##n4##_k1m##mdim(k_1) end_acc_nc##n5##_k1m##mdim(k_1)\ + end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) end_acc_nc##n2##_k1m##mdim(k_2)\ + end_acc_nc##n3##_k1m##mdim(k_2) end_acc_nc##n4##_k1m##mdim(k_2) end_acc_nc##n5##_k1m##mdim(k_2)\ + end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3)\ + end_acc_nc##n3##_k1m##mdim(k_3) end_acc_nc##n4##_k1m##mdim(k_3) end_acc_nc##n5##_k1m##mdim(k_3) + #define KERNEL_TAIL_C_n8(mdim) end_kernel_k4_ncx1(0,1,2,3,2,mdim) + #define KERNEL_TAIL_C_n12(mdim) 
\ + end_kernel_k4_ncx2(0,1,2,3,2,3,mdim) end_kernel_k4_ncx1(4,5,6,7,3,mdim) + #define KERNEL_TAIL_C_n16(mdim) \ + end_kernel_k4_ncx3(0,1,2,3,2,3,4,mdim) end_kernel_k4_ncx2(4,5,6,7,3,4,mdim) end_kernel_k4_ncx1(8,9,10,11,4,mdim) + #define KERNEL_TAIL_C_n20(mdim) \ + end_kernel_k4_ncx4(0,1,2,3,2,3,4,5,mdim) end_kernel_k4_ncx3(4,5,6,7,3,4,5,mdim)\ + end_kernel_k4_ncx2(8,9,10,11,4,5,mdim) end_kernel_k4_ncx1(12,13,14,15,5,mdim) + #define KERNEL_TAIL_C_n24(mdim) \ + end_kernel_k4_ncx5(0,1,2,3,2,3,4,5,6,mdim) end_kernel_k4_ncx4(4,5,6,7,3,4,5,6,mdim) end_kernel_k4_ncx3(8,9,10,11,4,5,6,mdim)\ + end_kernel_k4_ncx2(12,13,14,15,5,6,mdim) end_kernel_k4_ncx1(16,17,18,19,6,mdim) + #define KERNEL_TAIL_R_n4(mdim) \ + end_kernel_k4_ncx1(0,1,2,3,4,mdim) end_kernel_k4_ncx1(4,5,6,7,4,mdim) end_kernel_k4_ncx1(8,9,10,11,4,mdim) + #define KERNEL_TAIL_R_n8(mdim) \ + end_kernel_k4_ncx2(0,1,2,3,4,5,mdim) end_kernel_k4_ncx2(4,5,6,7,4,5,mdim) end_kernel_k4_ncx2(8,9,10,11,4,5,mdim) end_kernel_k4_ncx1(12,13,14,15,5,mdim) + #define KERNEL_TAIL_R_n12(mdim) \ + end_kernel_k4_ncx3(0,1,2,3,4,5,6,mdim) end_kernel_k4_ncx3(4,5,6,7,4,5,6,mdim) end_kernel_k4_ncx3(8,9,10,11,4,5,6,mdim)\ + end_kernel_k4_ncx2(12,13,14,15,5,6,mdim) end_kernel_k4_ncx1(16,17,18,19,6,mdim) + #endif +#else + #define KERNEL_HEAD_C_n8(mdim) "" + #define KERNEL_HEAD_C_n12(mdim) "" + #define KERNEL_HEAD_C_n16(mdim) "" + #define KERNEL_HEAD_C_n20(mdim) "" + #define KERNEL_HEAD_C_n24(mdim) "" + #define KERNEL_HEAD_R_n4(mdim) "" + #define KERNEL_HEAD_R_n8(mdim) "" + #define KERNEL_HEAD_R_n12(mdim) "" + #define KERNEL_TAIL_C_n8(mdim) "" + #define KERNEL_TAIL_C_n12(mdim) "" + #define KERNEL_TAIL_C_n16(mdim) "" + #define KERNEL_TAIL_C_n20(mdim) "" + #define KERNEL_TAIL_C_n24(mdim) "" + #define KERNEL_TAIL_R_n4(mdim) "" + #define KERNEL_TAIL_R_n8(mdim) "" + #define KERNEL_TAIL_R_n12(mdim) "" +#endif +#define KERNEL_HEAD_C_n1(mdim) "" +#define KERNEL_HEAD_C_n2(mdim) "" +#define KERNEL_HEAD_C_n4(mdim) "" +#define KERNEL_TAIL_C_n1(mdim) "" +#define KERNEL_TAIL_C_n2(mdim) "" +#define KERNEL_TAIL_C_n4(mdim) "" /* m = 16 */ /* zmm8-zmm31 for accumulators, zmm1-zmm7 for temporary use, zmm0 for alpha */ #define KERNEL_k1m16n1 \ @@ -15,9 +159,10 @@ #define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $8,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "vbroadcastsd 8(%1),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,%%zmm10; vfmadd231ps %%zmm5,%%zmm7,%%zmm11;" #define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;" -#define unit_kernel_k1m16n4(c1,c2,c3,c4, ...) \ - "vbroadcastsd ("#__VA_ARGS__"),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,"#c1"; vfmadd231ps %%zmm5,%%zmm6,"#c2";"\ - "vbroadcastsd 8("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,"#c3"; vfmadd231ps %%zmm5,%%zmm7,"#c4";" +#define unit_gen_kernel_k1m16n4(c1,c2,c3,c4,k_no,...) \ + "vbroadcastsd "#k_no"*16 ("#__VA_ARGS__"),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,"#c1"; vfmadd231ps %%zmm5,%%zmm6,"#c2";"\ + "vbroadcastsd "#k_no"*16+8("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,"#c3"; vfmadd231ps %%zmm5,%%zmm7,"#c4";" +#define unit_kernel_k1m16n4(c1,c2,c3,c4, ...) 
unit_gen_kernel_k1m16n4(c1,c2,c3,c4,0,__VA_ARGS__) #define KERNEL_h_k1m16n8 KERNEL_h_k1m16n4 unit_kernel_k1m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1) #define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%1;" #define KERNEL_h_k1m16n12 KERNEL_h_k1m16n8 unit_kernel_k1m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2) @@ -28,6 +173,12 @@ #define KERNEL_k1m16n20 KERNEL_h_k1m16n20 "addq $16,%%r15;" #define KERNEL_h_k1m16n24 KERNEL_h_k1m16n20 unit_kernel_k1m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2) #define KERNEL_k1m16n24 KERNEL_h_k1m16n24 "addq $16,%%r15;" +#define end_load_a_k1m16(k_no) "vmovsldup "#k_no"*64(%0),%%zmm4; vmovshdup "#k_no"*64(%0),%%zmm5;" +#define end_acc_nc2_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15,k_no,%1,%%r12,1) +#define end_acc_nc3_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19,k_no,%1,%%r12,2) +#define end_acc_nc4_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23,k_no,%%r15) +#define end_acc_nc5_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27,k_no,%%r15,%%r12,1) +#define end_acc_nc6_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31,k_no,%%r15,%%r12,2) #define INIT_m16n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;" #define INIT_m16n2 INIT_m16n1 "vpxorq %%zmm9,%%zmm9,%%zmm9;" #define INIT_m16n4 INIT_m16n2 "vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" @@ -38,11 +189,19 @@ #define INIT_m16n16 INIT_m16n12 unit_init_m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23) #define INIT_m16n20 INIT_m16n16 unit_init_m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27) #define INIT_m16n24 INIT_m16n20 unit_init_m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31) -#define SAVE_h_m16n1 "vfmadd213ps (%2),%%zmm0,%%zmm8; vmovups %%zmm8,(%2);" -#define unit_save_m16n2(c1,c2) \ +#ifdef TRMMKERNEL + #define SAVE_h_m16n1 "vmulps %%zmm8,%%zmm0,%%zmm8; vmovups %%zmm8,(%2);" + #define unit_save_m16n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\ + "vmulps %%zmm4,%%zmm0,%%zmm4; vmulps %%zmm5,%%zmm0,%%zmm5;"\ + "vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;" +#else + #define SAVE_h_m16n1 "vfmadd213ps (%2),%%zmm0,%%zmm8; vmovups %%zmm8,(%2);" + #define unit_save_m16n2(c1,c2) \ "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\ "vfmadd213ps (%5),%%zmm0,%%zmm4; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\ "vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;" +#endif #define SAVE_h_m16n2 "movq %2,%5;" unit_save_m16n2(%%zmm8,%%zmm9) #define SAVE_h_m16n4 SAVE_h_m16n2 unit_save_m16n2(%%zmm10,%%zmm11) #define SAVE_h_m16n8 SAVE_h_m16n4 unit_save_m16n2(%%zmm12,%%zmm13) unit_save_m16n2(%%zmm14,%%zmm15) @@ -52,8 +211,9 @@ #define SAVE_h_m16n24 SAVE_h_m16n20 unit_save_m16n2(%%zmm28,%%zmm29) unit_save_m16n2(%%zmm30,%%zmm31) #define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;" #define COMPUTE_m16(ndim) \ - INIT_m16n##ndim\ - "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5; xorq %%r10,%%r10;"\ + INIT_m16n##ndim START_SET_PAPB(16,ndim)\ + "movq %%r13,%4; movq %2,%5; xorq %%r10,%%r10;"\ + KERNEL_HEAD_C_n##ndim(16)\ "cmpq $16,%4; jb "#ndim"016162f;"\ #ndim"016161:\n\t"\ "cmpq $126,%%r10; movq $126,%%r10; cmoveq %3,%%r10;"\ @@ -72,28 +232,41 @@ KERNEL_k1m16n##ndim\ "leaq (%5,%3,2),%5; decq %4; jnz "#ndim"016163b;"\ #ndim"016164:\n\t"\ + KERNEL_TAIL_C_n##ndim(16)\ "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ - 
SAVE_m16(ndim) + SAVE_m16(ndim) END_SET_PA(16) /* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ -#define KERNEL_k1m8n1(b_addr) \ +#define kernel_k1m8n1(b_addr) \ "vmovups (%0),%%ymm1; addq $32,%0;"\ "vbroadcastss ("#b_addr"),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ "addq $4,"#b_addr";" -#define KERNEL_h_k1m8n2(b_addr) \ +#define kernel_h_k1m8n2(b_addr) \ "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ "vbroadcastsd ("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" -#define KERNEL_k1m8n2(b_addr) KERNEL_h_k1m8n2(b_addr) "addq $8,"#b_addr";" -#define KERNEL_h_k1m8n4(b_addr) \ - KERNEL_h_k1m8n2(b_addr) "vbroadcastsd 8("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" -#define KERNEL_k1m8n4(b_addr) KERNEL_h_k1m8n4(b_addr) "addq $16,"#b_addr";" -#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) \ - "vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ - "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" -#define KERNEL_h_k1m8n8(b_addr) KERNEL_h_k1m8n4(b_addr) unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,b_addr,%%r12,1) -#define KERNEL_k1m8n8(b_addr) KERNEL_h_k1m8n8(b_addr) "addq $16,"#b_addr";" -#define KERNEL_h_k1m8n12(b_addr) KERNEL_h_k1m8n8(b_addr) unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,b_addr,%%r12,2) -#define KERNEL_k1m8n12(b_addr) KERNEL_h_k1m8n12(b_addr) "addq $16,"#b_addr";" +#define kernel_k1m8n2(b_addr) kernel_h_k1m8n2(b_addr) "addq $8,"#b_addr";" +#define kernel_h_k1m8n4(b_addr) \ + kernel_h_k1m8n2(b_addr) "vbroadcastsd 8("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" +#define kernel_k1m8n4(b_addr) kernel_h_k1m8n4(b_addr) "addq $16,"#b_addr";" +#define unit_gen_kernel_k1m8n4(c1,c2,c3,c4,k_no,...) \ + "vbroadcastsd "#k_no"*16 ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastsd "#k_no"*16+8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" +#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) 
unit_gen_kernel_k1m8n4(c1,c2,c3,c4,0,__VA_ARGS__) +#define kernel_h_k1m8n8(b_addr) kernel_h_k1m8n4(b_addr) unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,b_addr,%%r12,1) +#define kernel_k1m8n8(b_addr) kernel_h_k1m8n8(b_addr) "addq $16,"#b_addr";" +#define kernel_h_k1m8n12(b_addr) kernel_h_k1m8n8(b_addr) unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,b_addr,%%r12,2) +#define kernel_k1m8n12(b_addr) kernel_h_k1m8n12(b_addr) "addq $16,"#b_addr";" +#define KERNEL_k1m8n1 kernel_k1m8n1(%1) +#define KERNEL_k1m8n2 kernel_k1m8n2(%1) +#define KERNEL_k1m8n4 kernel_k1m8n4(%1) +#define KERNEL_k1m8n8 kernel_k1m8n8(%1) +#define KERNEL_k1m8n12 kernel_k1m8n12(%1) +#define end_load_a_k1m8(k_no) "vmovsldup "#k_no"*32(%0),%%ymm1; vmovshdup "#k_no"*32(%0),%%ymm2;" +#define end_acc_nc2_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,k_no,%1,%%r12,1) +#define end_acc_nc3_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,k_no,%1,%%r12,2) +#define end_acc_nc4_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,k_no,%%r15) +#define end_acc_nc5_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,k_no,%%r15,%%r12,1) +#define end_acc_nc6_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,k_no,%%r15,%%r12,2) #define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" #define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" #define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;" @@ -101,12 +274,21 @@ "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" #define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) #define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) -#define SAVE_L_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" -#define unit_save_m8n2(c1,c2) \ +#ifdef TRMMKERNEL + #define SAVE_L_m8n1 "vmulps %%ymm4,%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" + #define unit_save_m8n2(c1,c2) \ "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3;"\ - "vunpcklpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5), %%ymm0,%%ymm1;vmovups %%ymm1,(%5);"\ - "vunpckhpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5,%3,1),%%ymm0,%%ymm1;vmovups %%ymm1,(%5,%3,1);"\ + "vunpcklpd %%ymm3,%%ymm2,%%ymm1; vmulps %%ymm1,%%ymm0,%%ymm1; vmovups %%ymm1,(%5);"\ + "vunpckhpd %%ymm3,%%ymm2,%%ymm1; vmulps %%ymm1,%%ymm0,%%ymm1; vmovups %%ymm1,(%5,%3,1);"\ "leaq (%5,%3,2),%5;" +#else + #define SAVE_L_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" + #define unit_save_m8n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3;"\ + "vunpcklpd %%ymm3,%%ymm2,%%ymm1; vfmadd213ps (%5), %%ymm0,%%ymm1; vmovups %%ymm1,(%5);"\ + "vunpckhpd %%ymm3,%%ymm2,%%ymm1; vfmadd213ps (%5,%3,1),%%ymm0,%%ymm1; vmovups %%ymm1,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#endif #define SAVE_L_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) #define SAVE_L_m8n4 SAVE_L_m8n2 unit_save_m8n2(%%ymm6,%%ymm7) #define SAVE_L_m8n8 SAVE_L_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) @@ -115,53 +297,68 @@ #define SAVE_R_m8n8 SAVE_R_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) #define SAVE_R_m8n12 SAVE_R_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) #define COMPUTE_L_m8(ndim,sim) \ - INIT_m8n##ndim\ - "movq %%r13,%4; movq %%r14,%1;"\ - #ndim""#sim"882:\n\t"\ + INIT_m8n##ndim START_SET_PAPB(8,ndim)\ + "movq %%r13,%4;"\ + KERNEL_HEAD_C_n##ndim(8)\ "testq %4,%4; jz "#ndim""#sim"883f;"\ - KERNEL_k1m8n##ndim(%1)\ - "decq %4; jmp 
"#ndim""#sim"882b;"\ + #ndim""#sim"882:\n\t"\ + kernel_k1m8n##ndim(%1)\ + "decq %4; jnz "#ndim""#sim"882b;"\ #ndim""#sim"883:\n\t"\ + KERNEL_TAIL_C_n##ndim(8)\ SAVE_L_m8n##ndim "addq $32,%2;" #define COMPUTE_R_m8(ndim,sim) \ - "subq %%r12,%0; subq %%r12,%0;"\ - INIT_m8n##ndim\ - "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ - #ndim""#sim"882:\n\t"\ + INIT_m8n##ndim RECOVER_PA(8)\ + "movq %%r13,%4;"\ + KERNEL_HEAD_R_n##ndim(8)\ "testq %4,%4; jz "#ndim""#sim"883f;"\ - KERNEL_k1m8n##ndim(%%r15)\ - "decq %4; jmp "#ndim""#sim"882b;"\ + #ndim""#sim"882:\n\t"\ + kernel_k1m8n##ndim(%%r15)\ + "decq %4; jnz "#ndim""#sim"882b;"\ #ndim""#sim"883:\n\t"\ - SAVE_R_m8n##ndim -#define COMPUTE_m8_n1 COMPUTE_L_m8(1,33833) -#define COMPUTE_m8_n2 COMPUTE_L_m8(2,33833) -#define COMPUTE_m8_n4 COMPUTE_L_m8(4,33833) -#define COMPUTE_m8_n8 COMPUTE_L_m8(8,33833) -#define COMPUTE_m8_n12 COMPUTE_L_m8(12,33833) + KERNEL_TAIL_R_n##ndim(8)\ + SAVE_R_m8n##ndim END_SET_PA(8) +#define COMPUTE_m8_n1 COMPUTE_L_m8(1,33833) END_SET_PA(8) +#define COMPUTE_m8_n2 COMPUTE_L_m8(2,33833) END_SET_PA(8) +#define COMPUTE_m8_n4 COMPUTE_L_m8(4,33833) END_SET_PA(8) +#define COMPUTE_m8_n8 COMPUTE_L_m8(8,33833) END_SET_PA(8) +#define COMPUTE_m8_n12 COMPUTE_L_m8(12,33833) END_SET_PA(8) #define COMPUTE_m8_n16 COMPUTE_L_m8(12,33733) COMPUTE_R_m8(4,33933) #define COMPUTE_m8_n20 COMPUTE_L_m8(12,33633) COMPUTE_R_m8(8,33933) #define COMPUTE_m8_n24 COMPUTE_L_m8(12,33533) COMPUTE_R_m8(12,33933) #define COMPUTE_m8(ndim) COMPUTE_m8_n##ndim /* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ -#define KERNEL_k1m4n1(b_addr) \ +#define kernel_k1m4n1(b_addr) \ "vmovups (%0),%%xmm1; addq $16,%0;"\ "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ "addq $4,"#b_addr";" -#define KERNEL_h_k1m4n2(b_addr) \ +#define kernel_h_k1m4n2(b_addr) \ "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ "vmovddup ("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" -#define KERNEL_k1m4n2(b_addr) KERNEL_h_k1m4n2(b_addr) "addq $8,"#b_addr";" -#define KERNEL_h_k1m4n4(b_addr) \ - KERNEL_h_k1m4n2(b_addr) "vmovddup 8("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" -#define KERNEL_k1m4n4(b_addr) KERNEL_h_k1m4n4(b_addr) "addq $16,"#b_addr";" -#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ - "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ - "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" -#define KERNEL_h_k1m4n8(b_addr) KERNEL_h_k1m4n4(b_addr) unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,b_addr,%%r12,1) -#define KERNEL_k1m4n8(b_addr) KERNEL_h_k1m4n8(b_addr) "addq $16,"#b_addr";" -#define KERNEL_h_k1m4n12(b_addr) KERNEL_h_k1m4n8(b_addr) unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,b_addr,%%r12,2) -#define KERNEL_k1m4n12(b_addr) KERNEL_h_k1m4n12(b_addr) "addq $16,"#b_addr";" +#define kernel_k1m4n2(b_addr) kernel_h_k1m4n2(b_addr) "addq $8,"#b_addr";" +#define kernel_h_k1m4n4(b_addr) \ + kernel_h_k1m4n2(b_addr) "vmovddup 8("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define kernel_k1m4n4(b_addr) kernel_h_k1m4n4(b_addr) "addq $16,"#b_addr";" +#define unit_gen_kernel_k1m4n4(c1,c2,c3,c4,k_no,...) 
\ + "vmovddup "#k_no"*16 ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ + "vmovddup "#k_no"*16+8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) unit_gen_kernel_k1m4n4(c1,c2,c3,c4,0,__VA_ARGS__) +#define kernel_h_k1m4n8(b_addr) kernel_h_k1m4n4(b_addr) unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,b_addr,%%r12,1) +#define kernel_k1m4n8(b_addr) kernel_h_k1m4n8(b_addr) "addq $16,"#b_addr";" +#define kernel_h_k1m4n12(b_addr) kernel_h_k1m4n8(b_addr) unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,b_addr,%%r12,2) +#define kernel_k1m4n12(b_addr) kernel_h_k1m4n12(b_addr) "addq $16,"#b_addr";" +#define KERNEL_k1m4n1 kernel_k1m4n1(%1) +#define KERNEL_k1m4n2 kernel_k1m4n2(%1) +#define KERNEL_k1m4n4 kernel_k1m4n4(%1) +#define KERNEL_k1m4n8 kernel_k1m4n8(%1) +#define KERNEL_k1m4n12 kernel_k1m4n12(%1) +#define end_load_a_k1m4(k_no) "vmovsldup "#k_no"*16(%0),%%xmm1; vmovshdup "#k_no"*16(%0),%%xmm2;" +#define end_acc_nc2_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,k_no,%1,%%r12,1) +#define end_acc_nc3_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,k_no,%1,%%r12,2) +#define end_acc_nc4_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm4,%%xmm5,%%xmm6,%%xmm7,k_no,%%r15) +#define end_acc_nc5_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,k_no,%%r15,%%r12,1) +#define end_acc_nc6_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,k_no,%%r15,%%r12,2) #define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" #define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" #define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" @@ -169,12 +366,21 @@ "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" #define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) #define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) -#define SAVE_L_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" -#define unit_save_m4n2(c1,c2) \ +#ifdef TRMMKERNEL + #define SAVE_L_m4n1 "vmulps %%xmm4,%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" + #define unit_save_m4n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3;"\ + "vunpcklpd %%xmm3,%%xmm2,%%xmm1;vmulps %%xmm1,%%xmm0,%%xmm1;vmovups %%xmm1,(%5);"\ + "vunpckhpd %%xmm3,%%xmm2,%%xmm1;vmulps %%xmm1,%%xmm0,%%xmm1;vmovups %%xmm1,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#else + #define SAVE_L_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" + #define unit_save_m4n2(c1,c2) \ "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3;"\ "vunpcklpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5), %%xmm0,%%xmm1;vmovups %%xmm1,(%5);"\ "vunpckhpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5,%3,1),%%xmm0,%%xmm1;vmovups %%xmm1,(%5,%3,1);"\ "leaq (%5,%3,2),%5;" +#endif #define SAVE_L_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) #define SAVE_L_m4n4 SAVE_L_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) #define SAVE_L_m4n8 SAVE_L_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) @@ -183,29 +389,32 @@ #define SAVE_R_m4n8 SAVE_R_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) #define SAVE_R_m4n12 SAVE_R_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) #define COMPUTE_L_m4(ndim,sim) \ - INIT_m4n##ndim\ - "movq %%r13,%4; movq %%r14,%1;"\ - #ndim""#sim"442:\n\t"\ + INIT_m4n##ndim START_SET_PAPB(4,ndim)\ + "movq %%r13,%4;"\ + KERNEL_HEAD_C_n##ndim(4)\ "testq %4,%4; 
jz "#ndim""#sim"443f;"\ - KERNEL_k1m4n##ndim(%1)\ - "decq %4; jmp "#ndim""#sim"442b;"\ + #ndim""#sim"442:\n\t"\ + kernel_k1m4n##ndim(%1)\ + "decq %4; jnz "#ndim""#sim"442b;"\ #ndim""#sim"443:\n\t"\ + KERNEL_TAIL_C_n##ndim(4)\ SAVE_L_m4n##ndim "addq $16,%2;" #define COMPUTE_R_m4(ndim,sim) \ - "subq %%r12,%0;"\ - INIT_m4n##ndim\ - "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ - #ndim""#sim"442:\n\t"\ + INIT_m4n##ndim RECOVER_PA(4)\ + "movq %%r13,%4;"\ + KERNEL_HEAD_R_n##ndim(4)\ "testq %4,%4; jz "#ndim""#sim"443f;"\ - KERNEL_k1m4n##ndim(%%r15)\ - "decq %4; jmp "#ndim""#sim"442b;"\ + #ndim""#sim"442:\n\t"\ + kernel_k1m4n##ndim(%%r15)\ + "decq %4; jnz "#ndim""#sim"442b;"\ #ndim""#sim"443:\n\t"\ - SAVE_R_m4n##ndim -#define COMPUTE_m4_n1 COMPUTE_L_m4(1,55855) -#define COMPUTE_m4_n2 COMPUTE_L_m4(2,55855) -#define COMPUTE_m4_n4 COMPUTE_L_m4(4,55855) -#define COMPUTE_m4_n8 COMPUTE_L_m4(8,55855) -#define COMPUTE_m4_n12 COMPUTE_L_m4(12,55855) + KERNEL_TAIL_R_n##ndim(4)\ + SAVE_R_m4n##ndim END_SET_PA(4) +#define COMPUTE_m4_n1 COMPUTE_L_m4(1,55855) END_SET_PA(4) +#define COMPUTE_m4_n2 COMPUTE_L_m4(2,55855) END_SET_PA(4) +#define COMPUTE_m4_n4 COMPUTE_L_m4(4,55855) END_SET_PA(4) +#define COMPUTE_m4_n8 COMPUTE_L_m4(8,55855) END_SET_PA(4) +#define COMPUTE_m4_n12 COMPUTE_L_m4(12,55855) END_SET_PA(4) #define COMPUTE_m4_n16 COMPUTE_L_m4(12,55755) COMPUTE_R_m4(4,55955) #define COMPUTE_m4_n20 COMPUTE_L_m4(12,55655) COMPUTE_R_m4(8,55955) #define COMPUTE_m4_n24 COMPUTE_L_m4(12,55555) COMPUTE_R_m4(12,55955) @@ -217,40 +426,60 @@ "vmovsd (%0),%%xmm1; addq $8,%0;"\ "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ "addq $4,%1;" -#define SAVE_h_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" #define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" #define KERNEL_k1m2n2 \ "vmovsd (%0),%%xmm1; addq $8,%0;"\ "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ "addq $8,%1;" -#define SAVE_h_m2n2 SAVE_h_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#ifdef TRMMKERNEL + #define SAVE_h_m2n1 "vmulps %%xmm4,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" + #define SAVE_h_m2n2 SAVE_h_m2n1 "vmulps %%xmm5,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#else + #define SAVE_h_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" + #define SAVE_h_m2n2 SAVE_h_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#endif #define INIT_m2n4 INIT_m2n2 #define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" #define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" #define INIT_m2n16 INIT_m2n12 "vpxor %%xmm10,%%xmm10,%%xmm10; vpxor %%xmm11,%%xmm11,%%xmm11;" #define INIT_m2n20 INIT_m2n16 "vpxor %%xmm12,%%xmm12,%%xmm12; vpxor %%xmm13,%%xmm13,%%xmm13;" #define INIT_m2n24 INIT_m2n20 "vpxor %%xmm14,%%xmm14,%%xmm14; vpxor %%xmm15,%%xmm15,%%xmm15;" +#define unit_gen_kernel_k1m2n4(c1,c2,k_no,...) 
\ + "vmovups "#k_no"*16("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";" #define KERNEL_h_k1m2n4 \ - "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2; addq $8,%0;"\ - "vmovups (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2; addq $8,%0;" unit_gen_kernel_k1m2n4(%%xmm4,%%xmm5,0,%1) #define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;" -#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 "vmovups (%1,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 unit_gen_kernel_k1m2n4(%%xmm6,%%xmm7,0,%1,%%r12,1) #define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%1;" -#define KERNEL_k1m2n12 KERNEL_h_k1m2n8 \ - "vmovups (%1,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm8; vfmadd231ps %%xmm2,%%xmm3,%%xmm9; addq $16,%1;" -#define KERNEL_h_k1m2n16 KERNEL_k1m2n12 "vmovups (%%r15),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm10; vfmadd231ps %%xmm2,%%xmm3,%%xmm11;" +#define KERNEL_k1m2n12 KERNEL_h_k1m2n8 unit_gen_kernel_k1m2n4(%%xmm8,%%xmm9,0,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m2n16 KERNEL_k1m2n12 unit_gen_kernel_k1m2n4(%%xmm10,%%xmm11,0,%%r15) #define KERNEL_k1m2n16 KERNEL_h_k1m2n16 "addq $16,%%r15;" -#define KERNEL_h_k1m2n20 KERNEL_h_k1m2n16 "vmovups (%%r15,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm12; vfmadd231ps %%xmm2,%%xmm3,%%xmm13;" +#define KERNEL_h_k1m2n20 KERNEL_h_k1m2n16 unit_gen_kernel_k1m2n4(%%xmm12,%%xmm13,0,%%r15,%%r12,1) #define KERNEL_k1m2n20 KERNEL_h_k1m2n20 "addq $16,%%r15;" -#define KERNEL_h_k1m2n24 KERNEL_h_k1m2n20 "vmovups (%%r15,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm14; vfmadd231ps %%xmm2,%%xmm3,%%xmm15;" +#define KERNEL_h_k1m2n24 KERNEL_h_k1m2n20 unit_gen_kernel_k1m2n4(%%xmm14,%%xmm15,0,%%r15,%%r12,2) #define KERNEL_k1m2n24 KERNEL_h_k1m2n24 "addq $16,%%r15;" -#define unit_save_m2n4(c1,c2) \ +#define end_load_a_k1m2(k_no) "vbroadcastss "#k_no"*8(%0),%%xmm1; vbroadcastss "#k_no"*8+4(%0),%%xmm2;" +#define end_acc_nc2_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm6,%%xmm7,k_no,%1,%%r12,1) +#define end_acc_nc3_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm8,%%xmm9,k_no,%1,%%r12,2) +#define end_acc_nc4_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm10,%%xmm11,k_no,%%r15) +#define end_acc_nc5_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm12,%%xmm13,k_no,%%r15,%%r12,1) +#define end_acc_nc6_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm14,%%xmm15,k_no,%%r15,%%r12,2) +#ifdef TRMMKERNEL + #define unit_save_m2n4(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ + "vmulps %%xmm1,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;"\ + "vmulps %%xmm2,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#else + #define unit_save_m2n4(c1,c2) \ "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\ "leaq (%5,%3,2),%5;"\ "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\ "leaq (%5,%3,2),%5;" +#endif #define SAVE_h_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) #define SAVE_h_m2n8 SAVE_h_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) #define SAVE_h_m2n12 SAVE_h_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) @@ -259,13 +488,15 @@ #define SAVE_h_m2n24 SAVE_h_m2n20 unit_save_m2n4(%%xmm14,%%xmm15) #define SAVE_m2(ndim) SAVE_h_m2n##ndim 
"addq $8,%2;" #define COMPUTE_m2(ndim) \ - INIT_m2n##ndim\ - "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ + INIT_m2n##ndim START_SET_PAPB(2,ndim)\ + "movq %%r13,%4;"\ + KERNEL_HEAD_C_n##ndim(2)\ "testq %4,%4; jz "#ndim"002022f;"\ #ndim"002021:\n\t"\ KERNEL_k1m2n##ndim "decq %4; jnz "#ndim"002021b;"\ #ndim"002022:\n\t"\ - SAVE_m2(ndim) + KERNEL_TAIL_C_n##ndim(2)\ + SAVE_m2(ndim) END_SET_PA(2) /* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ #define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" @@ -273,15 +504,25 @@ "vmovss (%1),%%xmm3; addq $4,%1;"\ "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ "addq $4,%0;" -#define SAVE_h_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#ifdef TRMMKERNEL + #define SAVE_h_m1n1 "vmulss %%xmm4,%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#else + #define SAVE_h_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#endif #define INIT_m1n2 INIT_m1n1 #define KERNEL_k1m1n2 \ "vmovsd (%1),%%xmm3; addq $8,%1;"\ "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ "addq $4,%0;" -#define SAVE_h_m1n2 \ +#ifdef TRMMKERNEL + #define SAVE_h_m1n2 \ + "vmulps %%xmm4,%%xmm0,%%xmm4;"\ + "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#else + #define SAVE_h_m1n2 \ "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#endif #define INIT_m1n4 INIT_m1n2 #define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" #define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" @@ -300,12 +541,25 @@ #define KERNEL_k1m1n20 KERNEL_h_k1m1n20 "addq $16,%%r15;" #define KERNEL_h_k1m1n24 KERNEL_h_k1m1n20 "vfmadd231ps (%%r15,%%r12,2),%%xmm1,%%xmm9;" #define KERNEL_k1m1n24 KERNEL_h_k1m1n24 "addq $16,%%r15;" -#define unit_save_m1n4(c1) \ +#define end_load_a_k1m1(k_no) "vbroadcastss "#k_no"*4(%0),%%xmm1;" +#define end_acc_nc2_k1m1(k_no) "vfmadd231ps "#k_no"*16(%1,%%r12,1),%%xmm1,%%xmm5;" +#define end_acc_nc3_k1m1(k_no) "vfmadd231ps "#k_no"*16(%1,%%r12,2),%%xmm1,%%xmm6;" +#define end_acc_nc4_k1m1(k_no) "vfmadd231ps "#k_no"*16(%%r15),%%xmm1,%%xmm7;" +#define end_acc_nc5_k1m1(k_no) "vfmadd231ps "#k_no"*16(%%r15,%%r12,1),%%xmm1,%%xmm8;" +#define end_acc_nc6_k1m1(k_no) "vfmadd231ps "#k_no"*16(%%r15,%%r12,2),%%xmm1,%%xmm9;" +#ifdef TRMMKERNEL + #define unit_save_m1n4(c1) \ + "vmulps "#c1",%%xmm0,"#c1"; vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ + "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#else + #define unit_save_m1n4(c1) \ "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#endif #define SAVE_h_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) #define SAVE_h_m1n8 SAVE_h_m1n4 unit_save_m1n4(%%xmm5) #define SAVE_h_m1n12 SAVE_h_m1n8 unit_save_m1n4(%%xmm6) @@ -314,58 +568,102 @@ #define SAVE_h_m1n24 SAVE_h_m1n20 unit_save_m1n4(%%xmm9) #define SAVE_m1(ndim) SAVE_h_m1n##ndim "addq $4,%2;" #define COMPUTE_m1(ndim) \ - INIT_m1n##ndim\ - "movq %%r13,%4; movq %%r14,%1; leaq 
(%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ + INIT_m1n##ndim START_SET_PAPB(1,ndim)\ + "movq %%r13,%4;"\ + KERNEL_HEAD_C_n##ndim(1)\ "testq %4,%4; jz "#ndim"001012f;"\ #ndim"001011:\n\t"\ KERNEL_k1m1n##ndim "decq %4; jnz "#ndim"001011b;"\ #ndim"001012:\n\t"\ - SAVE_m1(ndim) + KERNEL_TAIL_C_n##ndim(1)\ + SAVE_m1(ndim) END_SET_PA(1) -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */ -/* %6 = "+r"(next_b), %7 = "m"(ALPHA), %8 = "m"(M) */ -/* r11 = m_counter, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ +/* %7 = "m"(ALPHA), %8 = "m"(M), %9 = "m"(K), %10 = "m"(off) */ +#ifdef TRMMKERNEL + #if BACKWARDS == 1 + #define OFFSET_TO_K "movq %9,%%r13; subq %10,%%r13;" + #else + #define OFFSET_TO_K "movq %10,%%r13;" + #endif +#else + #define OFFSET_TO_K "movq %9,%%r13;" +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + #if BACKWARDS == 1 + #define START_UPDATE_OFFSET(ndim) {} + #define END_UPDATE_OFFSET(ndim) {off += (ndim);} + #else + #define START_UPDATE_OFFSET(ndim) {off += (ndim)>4 ? 4:(ndim);} + #define END_UPDATE_OFFSET(ndim) {off += (ndim)>4 ? ((ndim)-4):0;} + #endif +#else + #define START_UPDATE_OFFSET(ndim) {} + #define END_UPDATE_OFFSET(ndim) {} +#endif +#if defined(TRMMKERNEL) && defined(LEFT) + #if BACKWARDS == 1 + #define START_UPDATE_K(mdim) "" + #define END_UPDATE_K(mdim) "subq $"#mdim",%%r13;" + #else + #define START_UPDATE_K(mdim) "addq $"#mdim",%%r13;" + #define END_UPDATE_K(mdim) "" + #endif +#else + #define START_UPDATE_K(mdim) "" + #define END_UPDATE_K(mdim) "" +#endif #define COMPUTE(ndim) {\ - next_b = b_pointer + ndim * K;\ - __asm__ __volatile__(\ + next_b = b_pointer + ndim * K; START_UPDATE_OFFSET(ndim)\ + __asm__ __volatile__(\ "vbroadcastss %7,%%zmm0;"\ - "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %8,%%r11;"\ + OFFSET_TO_K "movq %9,%%r12; salq $4,%%r12; movq %1,%%r14; movq %8,%%r11;"\ "cmpq $16,%%r11;jb 33101"#ndim"f;"\ "33109"#ndim":\n\t"\ - COMPUTE_m16(ndim)\ + START_UPDATE_K(16) COMPUTE_m16(ndim) END_UPDATE_K(16)\ "subq $16,%%r11;cmpq $16,%%r11;jnb 33109"#ndim"b;"\ "33101"#ndim":\n\t"\ "cmpq $8,%%r11;jb 33102"#ndim"f;"\ - COMPUTE_m8(ndim)\ + START_UPDATE_K(8) COMPUTE_m8(ndim) END_UPDATE_K(8)\ "subq $8,%%r11;"\ "33102"#ndim":\n\t"\ "cmpq $4,%%r11;jb 33103"#ndim"f;"\ - COMPUTE_m4(ndim)\ + START_UPDATE_K(4) COMPUTE_m4(ndim) END_UPDATE_K(4)\ "subq $4,%%r11;"\ "33103"#ndim":\n\t"\ "cmpq $2,%%r11;jb 33104"#ndim"f;"\ - COMPUTE_m2(ndim)\ + START_UPDATE_K(2) COMPUTE_m2(ndim) END_UPDATE_K(2)\ "subq $2,%%r11;"\ "33104"#ndim":\n\t"\ "testq %%r11,%%r11;jz 33105"#ndim"f;"\ - COMPUTE_m1(ndim)\ + START_UPDATE_K(1) COMPUTE_m1(ndim) END_UPDATE_K(1)\ "33105"#ndim":\n\t"\ - "movq %%r13,%4; movq %%r14,%1; vzeroupper;"\ - :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(next_b):"m"(ALPHA),"m"(M)\ - :"r10","r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\ - "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\ - "cc","memory");\ - a_pointer -= M * K; b_pointer += ndim * K; c_pointer += LDC * ndim - M;\ + "movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_counter),"+r"(ctemp),"+r"(next_b)\ + :"m"(ALPHA),"m"(M),"m"(K),"m"(off):"r10","r11","r12","r13","r14","r15","cc","memory",\ + 
"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\ + "zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += LDC * ndim - M; END_UPDATE_OFFSET(ndim)\ } int __attribute__ ((noinline)) -CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC +#ifdef TRMMKERNEL +,BLASLONG offset +#endif +) { - if(m==0||n==0||k==0||alpha==(float)0.0) return 0; + if(m==0||n==0) return 0; int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha; - int64_t M = (int64_t)m, K = (int64_t)k; + int64_t M = (int64_t)m, K = (int64_t)k, k_counter = K, off = 0; BLASLONG n_count = n; float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; +#ifdef TRMMKERNEL + #ifdef LEFT + off = offset; + #else + off = -offset; + #endif +#endif for(;n_count>23;n_count-=24) COMPUTE(24) for(;n_count>19;n_count-=20) COMPUTE(20) for(;n_count>15;n_count-=16) COMPUTE(16) @@ -376,5 +674,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f if(n_count>0) COMPUTE(1) return 0; } -#include -#include "sgemm_direct_skylakex.c" +#ifndef TRMMKERNEL + #include + #include "sgemm_direct_skylakex.c" +#endif From f566787e6e15fb7c6fc563c1c3b5b66b865aeb77 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 16 Feb 2020 22:58:44 +0800 Subject: [PATCH 28/75] Update KERNEL.SKYLAKEX --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index dcd201649..9b3c83e42 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -1,7 +1,7 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex_2.c -STRMMKERNEL = sgemm_kernel_16x4_haswell.S +STRMMKERNEL = sgemm_kernel_16x4_skylakex_2.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c From b0558c11b9b1eccdd84dbdee225dc41efb07a390 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 16 Feb 2020 23:01:31 +0800 Subject: [PATCH 29/75] Update param.h --- param.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/param.h b/param.h index e6ab93aa5..e479314d9 100644 --- a/param.h +++ b/param.h @@ -1722,16 +1722,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_Q 128 -#define CGEMM3M_DEFAULT_UNROLL_N 8 -#define CGEMM3M_DEFAULT_UNROLL_M 4 -#define ZGEMM3M_DEFAULT_UNROLL_N 8 -#define ZGEMM3M_DEFAULT_UNROLL_M 2 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 -#define CGEMM3M_DEFAULT_P 448 -#define ZGEMM3M_DEFAULT_P 224 +#define CGEMM3M_DEFAULT_P 320 +#define ZGEMM3M_DEFAULT_P 256 #define XGEMM3M_DEFAULT_P 112 -#define CGEMM3M_DEFAULT_Q 224 -#define ZGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_Q 320 +#define ZGEMM3M_DEFAULT_Q 256 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 #define ZGEMM3M_DEFAULT_R 12288 From d483e9270ab21e2171e33803ac7de60f2c5e0fae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Feb 2020 17:29:35 +0100 Subject: [PATCH 30/75] Update KERNEL.POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 14de0c08d..ba9a99cd1 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -86,7 +86,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMAXKERNEL = ../arm/max.c #DMAXKERNEL = ../arm/max.c # -#SMINKERNEL = ../arm/min.c +#SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") From e32f3b144766362dad5f9def1005076c9ea82b85 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Feb 2020 17:32:13 +0100 Subject: [PATCH 31/75] Restore -march flag for Android builds fixes #2419 - renewed discussion in #2112 suggests removal of the option was primarily aimed at non-Android builds --- Makefile.arm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index b5d80f8e6..fac6b56824 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,7 +1,7 @@ ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) -CCOMMON_OPT += -mfpu=neon -FCOMMON_OPT += -mfpu=neon +CCOMMON_OPT += -mfpu=neon -march=armv7-a +FCOMMON_OPT += -mfpu=neon -march=armv7-a else CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a From 4326dcb460c78a062743fd5d7c7f0ac520ff7b56 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 16 Feb 2020 15:11:40 -0600 Subject: [PATCH 32/75] Pass CFLAGS from env to Makefile.prebuild and remove iOS hack --- .travis.yml | 2 +- Makefile.prebuild | 8 ++++---- Makefile.system | 2 +- c_check | 28 ++++++++-------------------- 4 files changed, 14 insertions(+), 26 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9e18412e8..0f20aef5c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -176,7 +176,7 @@ matrix: - <<: *test-macos osx_image: xcode10.1 env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk" + - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" diff --git a/Makefile.prebuild b/Makefile.prebuild index a366004a1..b00f13368 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -42,7 +42,7 @@ all: getarch_2nd ./getarch_2nd 
1 >> $(TARGET_CONF) config.h : c_check f_check getarch - perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) + perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS) ifneq ($(ONLY_CBLAS), 1) perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS) else @@ -59,13 +59,13 @@ endif getarch : getarch.c cpuid.S dummy $(CPUIDEMU) - $(HOSTCC) $(CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU) + $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU) getarch_2nd : getarch_2nd.c config.h dummy ifndef TARGET_CORE - $(HOSTCC) -I. $(CFLAGS) -o $(@F) getarch_2nd.c + $(HOSTCC) -I. $(HOST_CFLAGS) -o $(@F) getarch_2nd.c else - $(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c + $(HOSTCC) -I. $(HOST_CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c endif dummy: diff --git a/Makefile.system b/Makefile.system index c0e45515f..cf9e9bafa 100644 --- a/Makefile.system +++ b/Makefile.system @@ -214,7 +214,7 @@ ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf diff --git a/c_check b/c_check index 555b2eccf..c7899c84f 100644 --- a/c_check +++ b/c_check @@ -18,11 +18,12 @@ $binary = $ENV{"BINARY"}; $makefile = shift(@ARGV); $config = shift(@ARGV); -$compiler_name = join(" ", @ARGV); +$compiler_name = shift(@ARGV); +$flags = join(" ", @ARGV); # First, we need to know the target OS and compiler name -$data = `$compiler_name -E ctest.c`; +$data = `$compiler_name $flags -E ctest.c`; if ($?) { printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; @@ -175,7 +176,7 @@ if ($defined == 0) { # Do again -$data = `$compiler_name -E ctest.c`; +$data = `$compiler_name $flags -E ctest.c`; if ($?) { printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; @@ -195,7 +196,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { print $tmpf "void main(void){ __asm__ volatile($code); }\n"; $args = "$msa_flags -o $tmpf.o $tmpf"; - my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? != 0) { $have_msa = 0; @@ -236,7 +237,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { if ($compiler eq "PGI") { $args = " -tp skylake -c -o $tmpf.o $tmpf"; } - my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? 
!= 0) { $no_avx512 = 1; @@ -247,7 +248,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { } } -$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; +$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data =~ /globl\s([_\.]*)(.*)/; @@ -263,19 +264,6 @@ if ($architecture ne $hostarch) { $cross = 1 if ($os ne $hostos); -# rework cross suffix and architecture if we are on OSX cross-compiling for ARMV8-based IOS -# the initial autodetection will have been confused by the command-line arguments to clang -# and the cross-compiler apparently still claims to build for x86_64 in its CC -E output -if (($os eq "Darwin") && ($cross_suffix ne "")) { - my $tmpnam = `xcrun --sdk iphoneos --find clang`; - $cross_suffix = substr($tmpnam, 0, rindex($tmpnam, "/")+1 ); -# this should produce something like $cross_suffix="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/"; - $cross =1; - $architecture = arm64; -} - - - $openmp = "" if $ENV{USE_OPENMP} != 1; $linker_L = ""; @@ -283,7 +271,7 @@ $linker_l = ""; $linker_a = ""; { - $link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`; + $link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`; $link =~ s/\-Y\sP\,/\-Y/g; From 0e7f43c898ea646b8bfec982657e77ac09e9f118 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Fri, 14 Feb 2020 10:35:51 +0100 Subject: [PATCH 33/75] Add missing USE_MIN in kernel/CMakeLists.txt. --- kernel/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index b3310e87e..35e0fff25 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -47,7 +47,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}MAXKERNEL}" "" "max_k" false "" "" false ${float_type}) endif () if (DEFINED ${float_char}MINKERNEL) - GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "" "min_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "USE_MIN" "min_k" false "" "" false ${float_type}) endif () GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false ${float_type}) @@ -55,7 +55,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${I${float_char}MAXKERNEL}" "" "i*max_k" false "" "" false ${float_type}) endif () if (DEFINED I${float_char}MINKERNEL) - GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "" "i*min_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "USE_MIN" "i*min_k" false "" "" false ${float_type}) endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}ASUMKERNEL}" "" "asum_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "" "axpy_k" false "" "" false ${float_type}) From 18bcc36a6996d117c156394b047ec1c0e298e202 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 13 Feb 2020 14:32:24 +0100 Subject: [PATCH 34/75] Fix implementation of iamax_sse.S as reported in #2116. 
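For background, cmpeqss compares (and overwrites) only the low single-precision lane of its destination register, whereas cmpeqps compares all four packed lanes. A rough illustration in the kernel's AT&T syntax, with made-up lane contents (m stands for the running extreme kept in %xmm0, x for the element just loaded; movss from memory zeroes the upper lanes of its destination):

	/* given %xmm0 = { m, ?, ?, ? } and %xmm5 = { x, 0, 0, 0 } */
	cmpeqss	%xmm0, %xmm5	/* low lane only: %xmm5 = { x==m ? ~0 : 0, 0, 0, 0 } */
	/* ...whereas, applied to the same input... */
	cmpeqps	%xmm0, %xmm5	/* all lanes:     %xmm5 = { x==m ? ~0 : 0, 0==? ? ~0 : 0, ... } */

With the packed form, an upper lane that happens to contain 0 can set the mask regardless of x.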
There was a typo in iamax_sse.S where one of the comparisons used cmpeqps
instead of cmpeqss. That caused a misdetected index for sequences where the
minimum value was 0.
---
 kernel/x86_64/KERNEL      |  4 +-
 kernel/x86_64/iamax_sse.S |  6 +--
 utest/CMakeLists.txt      |  1 +
 utest/Makefile            |  2 +-
 utest/test_ismin.c        | 89 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 94 insertions(+), 8 deletions(-)
 create mode 100644 utest/test_ismin.c

diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index 92d121ab2..4874711bb 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -171,7 +171,7 @@ IXAMAXKERNEL = izamax.S
 endif
 
 ifndef ISAMINKERNEL
-ISAMINKERNEL = iamax.S
+ISAMINKERNEL = iamax_sse.S
 endif
 
@@ -207,7 +207,7 @@ IQMAXKERNEL = iamax.S
 endif
 
 ifndef ISMINKERNEL
-ISMINKERNEL = iamax.S
+ISMINKERNEL = iamax_sse.S
 endif
 
 ifndef IDMINKERNEL
diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S
index d50c1699c..9c7af1fd7 100644
--- a/kernel/x86_64/iamax_sse.S
+++ b/kernel/x86_64/iamax_sse.S
@@ -36,10 +36,6 @@
 /* or implied, of The University of Texas at Austin. */
 /*********************************************************************/
 
-/* This kernel was found to give wrong results when used for ISMIN/ISAMIN
-   with increment != 1, although it appears to be correct for corresponding
-   MAX operations. See issue 2116 */
-
 #define ASSEMBLER
 #include "common.h"
 
@@ -863,7 +859,7 @@
 #ifdef USE_ABS
 	andps	%xmm15, %xmm5
 #endif
-	cmpeqps	%xmm0, %xmm5
+	cmpeqss	%xmm0, %xmm5
 
 	movss	0 * SIZE(X), %xmm6
 	addq	INCX, X
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index 1e3051a8f..544646911 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -7,6 +7,7 @@ else ()
 set(OpenBLAS_utest_src
   utest_main.c
   test_amax.c
+  test_ismin.c
   test_rotmg.c
   test_rot.c
   test_axpy.c
diff --git a/utest/Makefile b/utest/Makefile
index bd4bdf3ae..32bdcc6e1 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -11,7 +11,7 @@ UTESTBIN=openblas_utest
 
 include $(TOPDIR)/Makefile.system
 
-OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o
+OBJS=utest_main.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o
 #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o
 
 ifneq ($(NO_LAPACK), 1)
diff --git a/utest/test_ismin.c b/utest/test_ismin.c
new file mode 100644
index 000000000..f23d6b545
--- /dev/null
+++ b/utest/test_ismin.c
@@ -0,0 +1,89 @@
+/*****************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +#define ELEMENTS 50 +#define INCREMENT 2 + +CTEST(ismin, positive_step_2){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0; + blasint index = BLASFUNC(ismin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +CTEST(ismin, negative_step_2){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -123456.0f; + blasint index = BLASFUNC(ismin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +CTEST(ismax, positive_step_2){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 123456.0f; + blasint index = BLASFUNC(ismax)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +CTEST(ismax, negative_step_2){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = 0; + blasint index = BLASFUNC(ismax)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} From aeea14ee4096860a75818547931c9dae43f965da Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 13 Feb 2020 14:42:45 +0100 Subject: [PATCH 35/75] Come up with LOAD_AND_COMPARE_TO_MXX macro in iamax_sse.S. 
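Each of the eight hand-unrolled blocks in the tail performed the same load / optional-abs / scalar-compare sequence and differed only in the destination register, so they can be expressed with a single assembler macro. For illustration (the comments here are not in the source), one expansion such as LOAD_AND_COMPARE_TO_MXX %xmm1 is roughly:

	movss	0 * SIZE(X), %xmm1	/* load the next element of X */
	addq	INCX, X			/* advance X by one (scaled) increment */
#ifdef USE_ABS
	andps	%xmm15, %xmm1		/* apply the mask in %xmm15 to take |x| when USE_ABS is defined */
#endif
	cmpeqss	%xmm0, %xmm1		/* scalar compare against the current max/min held in %xmm0 */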
--- kernel/x86_64/iamax_sse.S | 72 +++++++++------------------------------ 1 file changed, 17 insertions(+), 55 deletions(-) diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index 9c7af1fd7..4f62b9be2 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -55,6 +55,15 @@ #define MAXSS minss #endif +.macro LOAD_AND_COMPARE_TO_MXX REG + movss 0 * SIZE(X), \REG + addq INCX, X +#ifdef USE_ABS + andps %xmm15, \REG +#endif + cmpeqss %xmm0, \REG +.endm + #include "l1param.h" PROLOGUE @@ -826,61 +835,14 @@ ALIGN_4 .L93: - movss 0 * SIZE(X), %xmm1 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm1 -#endif - cmpeqss %xmm0, %xmm1 - - movss 0 * SIZE(X), %xmm2 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm2 -#endif - cmpeqss %xmm0, %xmm2 - - movss 0 * SIZE(X), %xmm3 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm3 -#endif - cmpeqss %xmm0, %xmm3 - - movss 0 * SIZE(X), %xmm4 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm4 -#endif - cmpeqss %xmm0, %xmm4 - - movss 0 * SIZE(X), %xmm5 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm5 -#endif - cmpeqss %xmm0, %xmm5 - - movss 0 * SIZE(X), %xmm6 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm6 -#endif - cmpeqss %xmm0, %xmm6 - - movss 0 * SIZE(X), %xmm7 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm7 -#endif - cmpeqss %xmm0, %xmm7 - - movss 0 * SIZE(X), %xmm8 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm8 -#endif - cmpeqss %xmm0, %xmm8 + LOAD_AND_COMPARE_TO_MXX %xmm1 + LOAD_AND_COMPARE_TO_MXX %xmm2 + LOAD_AND_COMPARE_TO_MXX %xmm3 + LOAD_AND_COMPARE_TO_MXX %xmm4 + LOAD_AND_COMPARE_TO_MXX %xmm5 + LOAD_AND_COMPARE_TO_MXX %xmm6 + LOAD_AND_COMPARE_TO_MXX %xmm7 + LOAD_AND_COMPARE_TO_MXX %xmm8 orps %xmm2, %xmm1 orps %xmm4, %xmm3 From 2c242b4cefbd9e9d238fe50cb6ee90ef72f4e561 Mon Sep 17 00:00:00 2001 From: Izaak Beekman Date: Mon, 17 Feb 2020 11:49:53 -0500 Subject: [PATCH 36/75] Add Github Action to build development branch nightly with Homebrew --- .github/workflows/nightly-Homebrew-build.yml | 63 ++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 .github/workflows/nightly-Homebrew-build.yml diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml new file mode 100644 index 000000000..a5778e2eb --- /dev/null +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -0,0 +1,63 @@ +on: + push: + pull_request: + branches: + - develop + schedule: + - cron: 45 7 * * * + +name: Nightly-Homebrew-Build +jobs: + build-OpenBLAS-with-Homebrew: + runs-on: macos-latest + env: + HOMEBREW_DEVELOPER: "ON" + HOMEBREW_DISPLAY_INSTALL_TIMES: "ON" + HOMEBREW_NO_ANALYTICS: "ON" + HOMEBREW_NO_AUTO_UPDATE: "ON" + HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON" + HOMEBREW_NO_INSTALL_CLEANUP: "ON" + + steps: + - name: Random delay for cron job + run: | + delay=$(( RANDOM % 600 )) + printf 'Delaying for %s seconds on event %s' ${delay} "${{ github.event_name }}" + sleep ${delay} + if: github.event_name == 'schedule' + + - uses: actions/checkout@v2 + + - name: Update Homebrew + if: github.event_name != 'pull_request' + run: brew update || true + + - name: Install prerequisites + run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas + + - name: Install and bottle OpenBLAS + run: brew install --fetch-HEAD --HEAD --build-bottle --keep-tmp openblas + + - name: Create bottle + run: brew bottle -v openblas + + - name: Upload bottle + uses: actions/upload-artifact@v1 + with: + name: openblas--HEAD.catalina.bottle.tar.gz + paht: ./*.bottle.* + + - 
name: Show linkage + run: brew linkage -v openblas + + - name: Test openblas + run: brew test --HEAD --verbose openblas + + - name: Audit openblas formula + run: | + brew audit --strict openblas + brew cat openblas + + - name: Post logs on failure + if: failure() + run: brew gist-logs --with-hostname -v openblas From 0b4480216421b7ea6f78b9c92bd2233e52ff79ab Mon Sep 17 00:00:00 2001 From: Izaak Beekman Date: Mon, 17 Feb 2020 13:12:50 -0500 Subject: [PATCH 37/75] Test push & PRs only when workflow file changes Also, add comments to clarify what the test is testing --- .github/workflows/nightly-Homebrew-build.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index a5778e2eb..b55ae9daf 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -1,10 +1,20 @@ +# Only the "head" branch of the OpenBLAS package is tested + on: push: + paths: + - '**/nightlyHomebrew-build.yml' pull_request: branches: - develop + paths: + - '**/nightly-Homebrew-build.yml' schedule: - cron: 45 7 * * * +# This is 7:45 AM UTC daily, late at night in the USA + +# Since push and pull_request will still always be building and testing the `develop` branch, +# it only makes sense to test if this file has been changed name: Nightly-Homebrew-Build jobs: @@ -27,6 +37,7 @@ jobs: if: github.event_name == 'schedule' - uses: actions/checkout@v2 + # This isn't even needed, technically. Homebrew will get `develop` via git - name: Update Homebrew if: github.event_name != 'pull_request' @@ -37,6 +48,7 @@ jobs: - name: Install and bottle OpenBLAS run: brew install --fetch-HEAD --HEAD --build-bottle --keep-tmp openblas + # the HEAD flags tell Homebrew to build the develop branch fetch via git - name: Create bottle run: brew bottle -v openblas From 1a88c4ab263d1d4e73bfb5721fb2e2c803a6c1af Mon Sep 17 00:00:00 2001 From: Izaak Beekman Date: Mon, 17 Feb 2020 13:32:33 -0500 Subject: [PATCH 38/75] Fix bottle upload problem & typo --- .github/workflows/nightly-Homebrew-build.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index b55ae9daf..f55e73d23 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -3,7 +3,7 @@ on: push: paths: - - '**/nightlyHomebrew-build.yml' + - '**/nightly-Homebrew-build.yml' pull_request: branches: - develop @@ -51,13 +51,16 @@ jobs: # the HEAD flags tell Homebrew to build the develop branch fetch via git - name: Create bottle - run: brew bottle -v openblas + run: | + brew bottle -v openblas + mkdir bottles + mv *.bottle.tar.gz bottles - name: Upload bottle uses: actions/upload-artifact@v1 with: name: openblas--HEAD.catalina.bottle.tar.gz - paht: ./*.bottle.* + path: bottles - name: Show linkage run: brew linkage -v openblas From 9f39f0a2c3df9d517932c9bd808755bbf5551383 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Feb 2020 19:55:39 +0100 Subject: [PATCH 39/75] Specify ismin/ismax assembly kernels for POWER8 directly to fix utest failure in new ismin test - Makefile.L1 defaults look wrong --- kernel/power/KERNEL.POWER8 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index ba9a99cd1..d3ae5def4 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -88,7 +88,10 @@ 
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c # #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c -# + +ISMINKERNEL = imin.S +ISMAXKERNEL = imax.S + ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") ifneq ($(GCCVERSIONGTEQ9),1) ISAMAXKERNEL = isamax_power8.S From 130c1741e5bba4bffb84e0057149b9c7b353bdee Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 18 Feb 2020 10:22:49 -0800 Subject: [PATCH 40/75] Fix install name on osx again --- Makefile.install | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index e01d866c9..2dc32c3d9 100644 --- a/Makefile.install +++ b/Makefile.install @@ -82,7 +82,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) endif ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" - @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" + @-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib From 76b2cec6ce9a80adde9ffe6ad31f5771e8dc26ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2020 18:08:20 +0100 Subject: [PATCH 41/75] Get endianness into Makefile variable --- Makefile.system | 1 + getarch.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/Makefile.system b/Makefile.system index cf9e9bafa..2073f56bb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1328,6 +1328,7 @@ export OSNAME export ARCH export CORE export LIBCORE +export __BYTE_ORDER__ export PGCPATH export CONFIG export CC diff --git a/getarch.c b/getarch.c index 1f590390a..53748897f 100644 --- a/getarch.c +++ b/getarch.c @@ -1298,6 +1298,13 @@ int main(int argc, char *argv[]){ #endif #endif +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); +#endif +#if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 +printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); +#endif + #ifdef MAKE_NB_JOBS #if MAKE_NB_JOBS > 0 printf("MAKE += -j %d\n", MAKE_NB_JOBS); From 0b39cf95b022c3902cf78e4f3df2d11468f8a7fe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2020 18:09:54 +0100 Subject: [PATCH 42/75] Fix endianness conditionals --- kernel/power/KERNEL.POWER8 | 10 +++++----- kernel/power/KERNEL.PPC440 | 8 ++++---- kernel/power/KERNEL.PPC970 | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index d3ae5def4..c7867012b 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -92,7 +92,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ISMINKERNEL = imin.S ISMAXKERNEL = imax.S -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) ISAMAXKERNEL = isamax_power8.S else @@ -104,7 +104,7 @@ endif # IDAMAXKERNEL = idamax.c # -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) ICAMAXKERNEL = icamax_power8.S else @@ -116,7 +116,7 @@ endif # IZAMAXKERNEL = izamax.c # -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) ISAMINKERNEL = isamin_power8.S else @@ -128,7 +128,7 @@ endif # IDAMINKERNEL = idamin.c # -ifneq 
($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) ICAMINKERNEL = icamin_power8.S else @@ -153,7 +153,7 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power8.S else diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index 8d64d3fc1..677af5f21 100644 --- a/kernel/power/KERNEL.PPC440 +++ b/kernel/power/KERNEL.PPC440 @@ -15,7 +15,7 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c else @@ -25,7 +25,7 @@ endif SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CDOTKERNEL = zdot_ppc440.S ZDOTKERNEL = zdot_ppc440.S else @@ -62,7 +62,7 @@ ZNRM2KERNEL = znrm2_ppc440.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CROTKERNEL = zrot_ppc440.S ZROTKERNEL = zrot_ppc440.S else @@ -132,7 +132,7 @@ ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S -ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = ../arm/gemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index cc1f215de..a99fb7d96 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -1,4 +1,4 @@ -ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) SGEMMKERNEL = gemm_kernel.S SGEMMINCOPY = SGEMMITCOPY = @@ -30,7 +30,7 @@ DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CGEMMKERNEL = zgemm_kernel.S CGEMMINCOPY = CGEMMITCOPY = @@ -72,7 +72,7 @@ ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S -ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) STRSMKERNEL_LN = trsm_kernel_LN.S STRSMKERNEL_LT = trsm_kernel_LT.S STRSMKERNEL_RN = trsm_kernel_LT.S From e8d82c01d4b78612991569e7734509c7f8dc7936 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2020 18:49:13 +0100 Subject: [PATCH 43/75] Recognize Ampere EMAG8180 --- cpuid_arm64.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 9e019fe3e..5868af75c 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -41,6 +41,8 @@ #define CPU_THUNDERX2T99 8 //Hisilicon #define CPU_TSV110 9 +// Ampere +#define CPU_EMAG8180 10 static char *cpuname[] = { "UNKNOWN", @@ -52,7 +54,8 @@ static char *cpuname[] = { "FALKOR", "THUNDERX", "THUNDERX2T99", - "TSV110" + "TSV110", + "EMAG8180" }; static char *cpuname_lower[] = { @@ -65,7 +68,8 @@ static char *cpuname_lower[] = { "falkor", "thunderx", "thunderx2t99", - "tsv110" + "tsv110", + "emag8180" }; int get_feature(char *search) @@ -152,6 +156,9 @@ int detect(void) // HiSilicon else if (strstr(cpu_implementer, 
"0x48") && strstr(cpu_part, "0xd01")) return CPU_TSV110; + // Ampere + else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000")) + return CPU_EMAG8180; } p = (char *) NULL ; @@ -335,6 +342,18 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; + + case CPU_EMAG8180: + // Minimum parameters for ARMv8 (based on A53) + printf("#define EMAG8180\n"); + printf("#define L1_CODE_SIZE 32768\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + } get_cpucount(); } From 71e5669c3ea60ddc524ef16f5066d88b8ab37b04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2020 18:57:26 +0100 Subject: [PATCH 44/75] Add preliminary support for EMAG8180 ARMV8 processor --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index e6ab93aa5..055749dc1 100644 --- a/param.h +++ b/param.h @@ -2603,7 +2603,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(CORTEXA53) || defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) || defined(TSV110) + defined(FALKOR) || defined(TSV110) || defined(EMAG8180) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 From e57b11accae62804f0d7a7fe2d9f15a9e19932c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2020 19:00:28 +0100 Subject: [PATCH 45/75] Add preliminary support for EMAG8180 --- kernel/arm64/KERNEL.EMAG8180 | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 kernel/arm64/KERNEL.EMAG8180 diff --git a/kernel/arm64/KERNEL.EMAG8180 b/kernel/arm64/KERNEL.EMAG8180 new file mode 100644 index 000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.EMAG8180 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + From 4046985913ad8b62fc997da2cd60447af4bdf093 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 21 Feb 2020 11:55:52 +0100 Subject: [PATCH 46/75] Add proper defaults for IxMIN/IxMAX kernels the fallbacks from Makefile.L1 assume a combined source for absolute value and non-absolute (with ifdef USE_ABS) but here we have separate implementations --- kernel/power/KERNEL | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL index c3c86b310..9070450f4 100644 --- a/kernel/power/KERNEL +++ b/kernel/power/KERNEL @@ -50,3 +50,26 @@ ifndef DSDOTKERNEL DSDOTKERNEL = ../generic/dot.c endif +ifndef ISMINKERNEL +ISMINKERNEL = imin.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = imin.S +endif + +ifndef IQMINKERNEL +IQMINKERNEL = imin.S +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = imax.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = imax.S +endif + +ifndef IQMAXKERNEL +IQMAXKERNEL = imax.S +endif From 07454bf4d55868a388733a4377fedbd8a56f15c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 21 Feb 2020 11:58:15 +0100 Subject: [PATCH 47/75] Add proper defaults for IxMIN/IxMAX kernels the fallbacks from Makefile.L1 assume a combined source for absolute value and non-absolute (with ifdef USE_ABS) but here we have separate implementations --- kernel/mips64/KERNEL | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 61da7445f..97ef3692c 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -167,3 +167,27 @@ 
endif CGEMM3MKERNEL = zgemm3m_kernel.S ZGEMM3MKERNEL = zgemm3m_kernel.S + +ifndef ISMINKERNEL +ISMINKERNEL = imin.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = imin.S +endif + +ifndef IQMINKERNEL +IQMINKERNEL = imin.S +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = imax.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = imax.S +endif + +ifndef IQMAXKERNEL +IQMAXKERNEL = imax.S +endif From f6fcbd7906acb6bbfffad49086863ffb0ba014da Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 22 Feb 2020 23:37:45 +0800 Subject: [PATCH 48/75] Fix performance bug when LDC is a multiple of 1024 --- sgemm_kernel_8x4_haswell_2.c | 424 +++++++++++++++++++++++++++++++++++ 1 file changed, 424 insertions(+) create mode 100644 sgemm_kernel_8x4_haswell_2.c diff --git a/sgemm_kernel_8x4_haswell_2.c b/sgemm_kernel_8x4_haswell_2.c new file mode 100644 index 000000000..5ab3e6d1f --- /dev/null +++ b/sgemm_kernel_8x4_haswell_2.c @@ -0,0 +1,424 @@ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ + +/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#define KERNEL_k1m8n1 \ + "vmovups (%0),%%ymm1; addq $32,%0;"\ + "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m8n2 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" +#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;" +#define KERNEL_h_k1m8n4 \ + KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" +#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define unit_kernel_k1m8n4(c1,c2,c3,c4,boff,...) 
\ + "vbroadcastsd "#boff"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastsd "#boff"+8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" +#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,%1,%%r12,4) +#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" +#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,%1,%%r12,8) +#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;" +#define KERNEL_k2m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,%1)\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,%1)\ + "addq $32,%1;" +#define KERNEL_L_k1m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ + "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "addq $16,%1;" +#define KERNEL_L_k2m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + "vbroadcastsd 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastsd 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastsd 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_L_k1m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ + "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $16,%1;" +#define KERNEL_L_k2m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ + "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps 
%%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ + "vbroadcastss 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 20(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 28(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 20(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_R_k1m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ + "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $16,%1;" +#define KERNEL_R_k2m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ + "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ + "vbroadcastss 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 28(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 20(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 28(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_R_k1m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "addq $16,%1;" +#define 
KERNEL_R_k2m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + "vbroadcastsd 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastsd 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastsd 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define unit_init_m8n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m8n8 unit_init_m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) +#define INIT_m8n4 INIT_m8n8 +#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define INIT_m8n6 INIT_m8n12 +#define INIT_m16n6 INIT_m8n12 +#define SAVE_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" +#define unit_save_m8n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3; vunpcklpd %%ymm3,%%ymm2,"#c1"; vunpckhpd %%ymm3,%%ymm2,"#c2";"\ + "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps (%5,%3,1),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) +#define SAVE_m8n4 "movq %2,%5;"\ + "vaddps %%ymm4,%%ymm8,%%ymm4; vaddps %%ymm5,%%ymm9,%%ymm5; vaddps %%ymm6,%%ymm10,%%ymm6; vaddps %%ymm7,%%ymm11,%%ymm7;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) +#define SAVE_m8n8 "movq %2,%5;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) +#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) +#define unit_save_m16n2(c1,c2,c3,c4) \ + "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps 32(%5),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",32(%5);"\ + "vfmadd213ps (%5,%3,1),%%ymm0,"#c3"; vfmadd213ps 32(%5,%3,1),%%ymm0,"#c4"; vmovups "#c3",(%5,%3,1); vmovups "#c4",32(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_L_m16n6 "movq %2,%5;"\ + unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_R_m16n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ + unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_L_m8n6 "movq %2,%5;"\ + "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ + "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) +#define SAVE_R_m8n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ + "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ + "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ + unit_save_m8n2(%%ymm4,%%ymm5) 
unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) + +/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ +#define KERNEL_k1m4n1 \ + "vmovups (%0),%%xmm1; addq $16,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m4n2 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ + "vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;" +#define KERNEL_h_k1m4n4 \ + KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ + "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ + "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,4) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,8) +#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" +#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" +#define unit_init_m4n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) +#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) +#define SAVE_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" +#define unit_save_m4n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3; vunpcklpd %%xmm3,%%xmm2,"#c1"; vunpckhpd %%xmm3,%%xmm2,"#c2";"\ + "vfmadd213ps (%5),%%xmm0,"#c1"; vmovups "#c1",(%5);"\ + "vfmadd213ps (%5,%3,1),%%xmm0,"#c2"; vmovups "#c2",(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) +#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) +#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) + +/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m2n1 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define SAVE_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" +#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define KERNEL_k1m2n2 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ + "addq $8,%1;" +#define SAVE_m2n2 SAVE_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#define INIT_m2n4 INIT_m2n2 +#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" +#define KERNEL_k1m2n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; 
vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "addq $8,%0;" +#define KERNEL_k1m2n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ + "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ + "addq $8,%0;" +#define KERNEL_k1m2n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ + "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ + "addq $8,%0;" +#define unit_save_m2n4(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) + +/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m1n1 \ + "vmovss (%1),%%xmm3; addq $4,%1;"\ + "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#define INIT_m1n2 INIT_m1n1 +#define KERNEL_k1m1n2 \ + "vmovsd (%1),%%xmm3; addq $8,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n2 \ + "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ + "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#define INIT_m1n4 INIT_m1n2 +#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define KERNEL_k1m1n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define KERNEL_k1m1n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ + "addq $4,%0;" +#define KERNEL_k1m1n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ + "addq $4,%0;" +#define unit_save_m1n4(c1) \ + "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) +#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_m1n12 SAVE_m1n8 
unit_save_m1n4(%%xmm6) + +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ + +#define COMPUTE_SIMPLE(mdim,ndim) \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %4,%4; jz 7"#mdim"7"#ndim"2f;"\ + "7"#mdim"7"#ndim"1:\n\t"\ + KERNEL_k1m##mdim##n##ndim "decq %4; jnz 7"#mdim"7"#ndim"1b;"\ + "7"#mdim"7"#ndim"2:\n\t"\ + SAVE_m##mdim##n##ndim "addq $"#mdim"*4,%2;" +#define COMPUTE_m8n1 COMPUTE_SIMPLE(8,1) +#define COMPUTE_m8n2 COMPUTE_SIMPLE(8,2) +#define COMPUTE_m8n8 COMPUTE_SIMPLE(8,8) +#define COMPUTE_m8n12 COMPUTE_SIMPLE(8,12) +#define COMPUTE_m8n4 \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n4\ + "cmpq $8,%4; jb 78740f;"\ + "78749:\n\t"\ + KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4\ + "subq $8,%4; cmpq $8,%4; jnb 78749b;"\ + "78740:\n\t"\ + "testq %4,%4; jz 78742f;"\ + "78741:\n\t"\ + KERNEL_k1m8n4 "decq %4; jnz 78741b;"\ + "78742:\n\t"\ + SAVE_m8n4 "addq $32,%2;" +#define COMPUTE_L_m16n6 \ + "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ + "movq %%r13,%4; movq %2,%5; cmpq $16,%%r13; jb 7116762f; movq $14,%4;"\ + "7116761:\n\t"\ + KERNEL_L_k2m16n6 "prefetcht0 128(%1); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ + KERNEL_L_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ + KERNEL_L_k2m16n6 "prefetcht0 128(%1); prefetcht1 (%6); cmpq $198,%4; cmoveq %2,%5;"\ + KERNEL_L_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7116761b;"\ + "movq %2,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\ + "7116762:\n\t"\ + "xorq %%r15,%%r15; testq %4,%4; jz 7116764f;"\ + "7116763:\n\t"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ + KERNEL_L_k1m16n6 "cmpq $6,%%r15; cmoveq %2,%5; decq %4; jnz 7116763b;"\ + "7116764:\n\t"\ + SAVE_L_m16n6 "addq $32,%2;" +#define COMPUTE_R_m16n6 \ + "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ + "movq %%r13,%4; leaq (%2,%3,4),%5; leaq (%5,%3,2),%5; movq %5,%%r10; cmpq $16,%%r13; jb 7216762f; movq $14,%4;"\ + "7216761:\n\t"\ + KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ + KERNEL_R_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ + KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); prefetcht1 (%6); cmpq $198,%4; cmoveq %%r10,%5;"\ + KERNEL_R_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7216761b;"\ + "movq %%r10,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\ + "7216762:\n\t"\ + "xorq %%r15,%%r15; testq %4,%4; jz 7216764f;"\ + "7216763:\n\t"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ + KERNEL_R_k1m16n6 "cmpq $6,%%r15; cmoveq %%r10,%5; decq %4; jnz 7216763b;"\ + "7216764:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_R_m16n6 "addq $32,%2;" +#define COMPUTE_H_m8n6 \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ + "cmpq $8,%4; jb 718760f; movq %2,%5; xorq %%r15,%%r15;"\ + "718769:\n\t"\ + KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "cmpq $62,%%r15; movq $62,%%r15; cmoveq %3,%%r15;"\ + KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "prefetcht2 (%5); leaq -31(%5,%%r15,1),%5;"\ + "subq $8,%4; cmpq $8,%4; jnb 718769b;"\ + "718760:\n\t"\ + "testq %4,%4; jz 718762f;"\ + "718761:\n\t"\ + KERNEL_L_k1m8n6 "decq %4; jnz 718761b;"\ + "718762:\n\t"\ + SAVE_L_m8n6 "negq %%r12; leaq (%0,%%r12,8),%0; negq %%r12;" +#define COMPUTE_T_m8n6(side,sim) \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ + "cmpq $8,%4; jb 
72"#sim"8760f;"\ + "72"#sim"8769:\n\t"\ + KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6\ + "subq $8,%4; cmpq $8,%4; jnb 72"#sim"8769b;"\ + "72"#sim"8760:\n\t"\ + "testq %4,%4; jz 72"#sim"8762f;"\ + "72"#sim"8761:\n\t"\ + KERNEL_##side##_k1m8n6 "decq %4; jnz 72"#sim"8761b;"\ + "72"#sim"8762:\n\t"\ + SAVE_##side##_m8n6 "addq $32,%2;" +#define COMPUTE_NORMAL(ndim) {\ + next_b = b_pointer + ndim * K;\ + __asm__ __volatile__(\ + "vbroadcastss %9,%%ymm0;"\ + "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $8,%%r11;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m8n##ndim\ + "subq $8,%%r11;cmpq $8,%%r11;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $4,%%r11;jb 33103"#ndim"f;"\ + COMPUTE_SIMPLE(4,ndim) "subq $4,%%r11;"\ + "33103"#ndim":\n\t"\ + "cmpq $2,%%r11;jb 33104"#ndim"f;"\ + COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\ + "33104"#ndim":\n\t"\ + "testq %%r11,%%r11;jz 33105"#ndim"f;"\ + COMPUTE_SIMPLE(1,ndim)\ + "33105"#ndim":\n\t"\ + "movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ + :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += (LDC * ndim - M);\ +} +#define COMPUTE_n12 {\ + next_b = b_pointer + 12 * K;\ + __asm__ __volatile__(\ + "vbroadcastss %9,%%ymm0;"\ + "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $16,%%r11;jb 3310112f;"\ + COMPUTE_H_m8n6\ + "3310612:\n\t"\ + COMPUTE_R_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jb 3310712f;"\ + COMPUTE_L_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jnb 3310612b;"\ + COMPUTE_T_m8n6(R,5) "subq $8,%%r11; jmp 3310212f;"\ + "3310712:\n\t"\ + COMPUTE_T_m8n6(L,7) "subq $8,%%r11; jmp 3310212f;"\ + "3310112:\n\t"\ + "cmpq $8,%%r11;jb 3310212f;"\ + COMPUTE_SIMPLE(8,12) "subq $8,%%r11;"\ + "3310212:\n\t"\ + "cmpq $4,%%r11;jb 3310312f;"\ + COMPUTE_SIMPLE(4,12) "subq $4,%%r11;"\ + "3310312:\n\t"\ + "cmpq $2,%%r11;jb 3310412f;"\ + COMPUTE_SIMPLE(2,12) "subq $2,%%r11;"\ + "3310412:\n\t"\ + "testq %%r11,%%r11;jz 3310512f;"\ + COMPUTE_SIMPLE(1,12)\ + "3310512:\n\t"\ + "movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ + :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + a_pointer -= M * K; b_pointer += 12 * K; c_pointer += (LDC * 12 - M);\ +} + +#include "common.h" +#include +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC){ + if(m==0||n==0||k==0||alpha==(float)0.0) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float); + float ALPHA = alpha; + int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; + BLASLONG n_count = n; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; + for(;n_count>11;n_count-=12) COMPUTE_n12 + for(;n_count>7;n_count-=8) COMPUTE_NORMAL(8) + for(;n_count>3;n_count-=4) COMPUTE_NORMAL(4) + for(;n_count>1;n_count-=2) COMPUTE_NORMAL(2) + if(n_count>0) COMPUTE_NORMAL(1) + return 0; +} + From f1746e7284e0ac00c45937087547cf2d43823968 Mon Sep 17 00:00:00 2001 From: wjc404 
<52632443+wjc404@users.noreply.github.com> Date: Sat, 22 Feb 2020 23:38:48 +0800 Subject: [PATCH 49/75] Delete sgemm_kernel_8x4_haswell_2.c --- sgemm_kernel_8x4_haswell_2.c | 424 ----------------------------------- 1 file changed, 424 deletions(-) delete mode 100644 sgemm_kernel_8x4_haswell_2.c diff --git a/sgemm_kernel_8x4_haswell_2.c b/sgemm_kernel_8x4_haswell_2.c deleted file mode 100644 index 5ab3e6d1f..000000000 --- a/sgemm_kernel_8x4_haswell_2.c +++ /dev/null @@ -1,424 +0,0 @@ -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ -/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ - -/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ -#define KERNEL_k1m8n1 \ - "vmovups (%0),%%ymm1; addq $32,%0;"\ - "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ - "addq $4,%1;" -#define KERNEL_h_k1m8n2 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ - "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" -#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;" -#define KERNEL_h_k1m8n4 \ - KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" -#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" -#define unit_kernel_k1m8n4(c1,c2,c3,c4,boff,...) \ - "vbroadcastsd "#boff"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ - "vbroadcastsd "#boff"+8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" -#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,%1,%%r12,4) -#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" -#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,%1,%%r12,8) -#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;" -#define KERNEL_k2m8n4 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ - unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,%1)\ - "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ - unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,%1)\ - "addq $32,%1;" -#define KERNEL_L_k1m8n6 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ - "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "addq $16,%1;" -#define KERNEL_L_k2m8n6 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ - "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ - "vbroadcastsd 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastsd 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastsd 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "addq 
$32,%1;" -#define KERNEL_L_k1m16n6 \ - "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ - "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "addq $16,%1;" -#define KERNEL_L_k2m16n6 \ - "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ - "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ - "vbroadcastss 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 20(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 28(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 20(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "addq $32,%1;" -#define KERNEL_R_k1m16n6 \ - "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ - "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "addq $16,%1;" -#define KERNEL_R_k2m16n6 \ - "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ - "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps 
%%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ - "vbroadcastss 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 28(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 20(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 28(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "addq $32,%1;" -#define KERNEL_R_k1m8n6 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ - "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "addq $16,%1;" -#define KERNEL_R_k2m8n6 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ - "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ - "vbroadcastsd 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastsd 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastsd 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "addq $32,%1;" -#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" -#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" -#define unit_init_m8n4(c1,c2,c3,c4) \ - "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" -#define INIT_m8n8 unit_init_m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) -#define INIT_m8n4 INIT_m8n8 -#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) -#define INIT_m8n6 INIT_m8n12 -#define INIT_m16n6 INIT_m8n12 -#define SAVE_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" -#define unit_save_m8n2(c1,c2) \ - "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3; vunpcklpd %%ymm3,%%ymm2,"#c1"; vunpckhpd %%ymm3,%%ymm2,"#c2";"\ - "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps (%5,%3,1),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",(%5,%3,1); leaq (%5,%3,2),%5;" -#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) -#define SAVE_m8n4 "movq %2,%5;"\ - "vaddps %%ymm4,%%ymm8,%%ymm4; vaddps %%ymm5,%%ymm9,%%ymm5; vaddps %%ymm6,%%ymm10,%%ymm6; vaddps %%ymm7,%%ymm11,%%ymm7;"\ - unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) -#define SAVE_m8n8 "movq %2,%5;"\ - unit_save_m8n2(%%ymm4,%%ymm5) 
unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) -#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) -#define unit_save_m16n2(c1,c2,c3,c4) \ - "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps 32(%5),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",32(%5);"\ - "vfmadd213ps (%5,%3,1),%%ymm0,"#c3"; vfmadd213ps 32(%5,%3,1),%%ymm0,"#c4"; vmovups "#c3",(%5,%3,1); vmovups "#c4",32(%5,%3,1); leaq (%5,%3,2),%5;" -#define SAVE_L_m16n6 "movq %2,%5;"\ - unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) -#define SAVE_R_m16n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ - unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) -#define SAVE_L_m8n6 "movq %2,%5;"\ - "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ - "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ - unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) -#define SAVE_R_m8n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ - "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ - "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ - unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) - -/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ -#define KERNEL_k1m4n1 \ - "vmovups (%0),%%xmm1; addq $16,%0;"\ - "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ - "addq $4,%1;" -#define KERNEL_h_k1m4n2 \ - "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ - "vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" -#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;" -#define KERNEL_h_k1m4n4 \ - KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" -#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" -#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) 
\ - "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ - "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" -#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,4) -#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" -#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,8) -#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" -#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" -#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" -#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" -#define unit_init_m4n4(c1,c2,c3,c4) \ - "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" -#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) -#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) -#define SAVE_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" -#define unit_save_m4n2(c1,c2) \ - "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3; vunpcklpd %%xmm3,%%xmm2,"#c1"; vunpckhpd %%xmm3,%%xmm2,"#c2";"\ - "vfmadd213ps (%5),%%xmm0,"#c1"; vmovups "#c1",(%5);"\ - "vfmadd213ps (%5,%3,1),%%xmm0,"#c2"; vmovups "#c2",(%5,%3,1);"\ - "leaq (%5,%3,2),%5;" -#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) -#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) -#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) -#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) - -/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ -#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" -#define KERNEL_k1m2n1 \ - "vmovsd (%0),%%xmm1; addq $8,%0;"\ - "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ - "addq $4,%1;" -#define SAVE_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" -#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" -#define KERNEL_k1m2n2 \ - "vmovsd (%0),%%xmm1; addq $8,%0;"\ - "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ - "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ - "addq $8,%1;" -#define SAVE_m2n2 SAVE_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" -#define INIT_m2n4 INIT_m2n2 -#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" -#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" -#define KERNEL_k1m2n4 \ - "vmovups (%1),%%xmm3; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ - "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ - "addq $8,%0;" -#define KERNEL_k1m2n8 \ - "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ - "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ - "addq $8,%0;" -#define KERNEL_k1m2n12 \ - "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ - "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps 
%%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ - "addq $8,%0;" -#define unit_save_m2n4(c1,c2) \ - "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ - "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ - "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ - "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ - "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" -#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) -#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) -#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) - -/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ -#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" -#define KERNEL_k1m1n1 \ - "vmovss (%1),%%xmm3; addq $4,%1;"\ - "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ - "addq $4,%0;" -#define SAVE_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" -#define INIT_m1n2 INIT_m1n1 -#define KERNEL_k1m1n2 \ - "vmovsd (%1),%%xmm3; addq $8,%1;"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ - "addq $4,%0;" -#define SAVE_m1n2 \ - "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ - "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" -#define INIT_m1n4 INIT_m1n2 -#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" -#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" -#define KERNEL_k1m1n4 \ - "vmovups (%1),%%xmm3; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ - "addq $4,%0;" -#define KERNEL_k1m1n8 \ - "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ - "addq $4,%0;" -#define KERNEL_k1m1n12 \ - "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ - "addq $4,%0;" -#define unit_save_m1n4(c1) \ - "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ - "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ - "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ - "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ - "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" -#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) -#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5) -#define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6) - -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ -/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ - -#define COMPUTE_SIMPLE(mdim,ndim) \ - "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m##mdim##n##ndim\ - "testq %4,%4; jz 7"#mdim"7"#ndim"2f;"\ - "7"#mdim"7"#ndim"1:\n\t"\ - KERNEL_k1m##mdim##n##ndim "decq %4; jnz 7"#mdim"7"#ndim"1b;"\ - "7"#mdim"7"#ndim"2:\n\t"\ - SAVE_m##mdim##n##ndim "addq $"#mdim"*4,%2;" -#define COMPUTE_m8n1 COMPUTE_SIMPLE(8,1) -#define COMPUTE_m8n2 COMPUTE_SIMPLE(8,2) -#define COMPUTE_m8n8 COMPUTE_SIMPLE(8,8) -#define COMPUTE_m8n12 COMPUTE_SIMPLE(8,12) -#define COMPUTE_m8n4 \ - 
"movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n4\ - "cmpq $8,%4; jb 78740f;"\ - "78749:\n\t"\ - KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4\ - "subq $8,%4; cmpq $8,%4; jnb 78749b;"\ - "78740:\n\t"\ - "testq %4,%4; jz 78742f;"\ - "78741:\n\t"\ - KERNEL_k1m8n4 "decq %4; jnz 78741b;"\ - "78742:\n\t"\ - SAVE_m8n4 "addq $32,%2;" -#define COMPUTE_L_m16n6 \ - "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ - "movq %%r13,%4; movq %2,%5; cmpq $16,%%r13; jb 7116762f; movq $14,%4;"\ - "7116761:\n\t"\ - KERNEL_L_k2m16n6 "prefetcht0 128(%1); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ - KERNEL_L_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ - KERNEL_L_k2m16n6 "prefetcht0 128(%1); prefetcht1 (%6); cmpq $198,%4; cmoveq %2,%5;"\ - KERNEL_L_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7116761b;"\ - "movq %2,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\ - "7116762:\n\t"\ - "xorq %%r15,%%r15; testq %4,%4; jz 7116764f;"\ - "7116763:\n\t"\ - "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ - KERNEL_L_k1m16n6 "cmpq $6,%%r15; cmoveq %2,%5; decq %4; jnz 7116763b;"\ - "7116764:\n\t"\ - SAVE_L_m16n6 "addq $32,%2;" -#define COMPUTE_R_m16n6 \ - "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ - "movq %%r13,%4; leaq (%2,%3,4),%5; leaq (%5,%3,2),%5; movq %5,%%r10; cmpq $16,%%r13; jb 7216762f; movq $14,%4;"\ - "7216761:\n\t"\ - KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ - KERNEL_R_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ - KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); prefetcht1 (%6); cmpq $198,%4; cmoveq %%r10,%5;"\ - KERNEL_R_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7216761b;"\ - "movq %%r10,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\ - "7216762:\n\t"\ - "xorq %%r15,%%r15; testq %4,%4; jz 7216764f;"\ - "7216763:\n\t"\ - "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ - KERNEL_R_k1m16n6 "cmpq $6,%%r15; cmoveq %%r10,%5; decq %4; jnz 7216763b;"\ - "7216764:\n\t"\ - "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_R_m16n6 "addq $32,%2;" -#define COMPUTE_H_m8n6 \ - "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ - "cmpq $8,%4; jb 718760f; movq %2,%5; xorq %%r15,%%r15;"\ - "718769:\n\t"\ - KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "cmpq $62,%%r15; movq $62,%%r15; cmoveq %3,%%r15;"\ - KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "prefetcht2 (%5); leaq -31(%5,%%r15,1),%5;"\ - "subq $8,%4; cmpq $8,%4; jnb 718769b;"\ - "718760:\n\t"\ - "testq %4,%4; jz 718762f;"\ - "718761:\n\t"\ - KERNEL_L_k1m8n6 "decq %4; jnz 718761b;"\ - "718762:\n\t"\ - SAVE_L_m8n6 "negq %%r12; leaq (%0,%%r12,8),%0; negq %%r12;" -#define COMPUTE_T_m8n6(side,sim) \ - "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ - "cmpq $8,%4; jb 72"#sim"8760f;"\ - "72"#sim"8769:\n\t"\ - KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6\ - "subq $8,%4; cmpq $8,%4; jnb 72"#sim"8769b;"\ - "72"#sim"8760:\n\t"\ - "testq %4,%4; jz 72"#sim"8762f;"\ - "72"#sim"8761:\n\t"\ - KERNEL_##side##_k1m8n6 "decq %4; jnz 72"#sim"8761b;"\ - "72"#sim"8762:\n\t"\ - SAVE_##side##_m8n6 "addq $32,%2;" -#define COMPUTE_NORMAL(ndim) {\ - next_b = b_pointer + ndim * K;\ - __asm__ __volatile__(\ - "vbroadcastss %9,%%ymm0;"\ - "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ - "cmpq $8,%%r11;jb 33101"#ndim"f;"\ - "33109"#ndim":\n\t"\ - COMPUTE_m8n##ndim\ - "subq $8,%%r11;cmpq $8,%%r11;jnb 33109"#ndim"b;"\ - "33101"#ndim":\n\t"\ - "cmpq $4,%%r11;jb 33103"#ndim"f;"\ - COMPUTE_SIMPLE(4,ndim) "subq 
$4,%%r11;"\ - "33103"#ndim":\n\t"\ - "cmpq $2,%%r11;jb 33104"#ndim"f;"\ - COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\ - "33104"#ndim":\n\t"\ - "testq %%r11,%%r11;jz 33105"#ndim"f;"\ - COMPUTE_SIMPLE(1,ndim)\ - "33105"#ndim":\n\t"\ - "movq %%r14,%1; vzeroupper;"\ - :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ - :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ - a_pointer -= M * K; b_pointer += ndim * K; c_pointer += (LDC * ndim - M);\ -} -#define COMPUTE_n12 {\ - next_b = b_pointer + 12 * K;\ - __asm__ __volatile__(\ - "vbroadcastss %9,%%ymm0;"\ - "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ - "cmpq $16,%%r11;jb 3310112f;"\ - COMPUTE_H_m8n6\ - "3310612:\n\t"\ - COMPUTE_R_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jb 3310712f;"\ - COMPUTE_L_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jnb 3310612b;"\ - COMPUTE_T_m8n6(R,5) "subq $8,%%r11; jmp 3310212f;"\ - "3310712:\n\t"\ - COMPUTE_T_m8n6(L,7) "subq $8,%%r11; jmp 3310212f;"\ - "3310112:\n\t"\ - "cmpq $8,%%r11;jb 3310212f;"\ - COMPUTE_SIMPLE(8,12) "subq $8,%%r11;"\ - "3310212:\n\t"\ - "cmpq $4,%%r11;jb 3310312f;"\ - COMPUTE_SIMPLE(4,12) "subq $4,%%r11;"\ - "3310312:\n\t"\ - "cmpq $2,%%r11;jb 3310412f;"\ - COMPUTE_SIMPLE(2,12) "subq $2,%%r11;"\ - "3310412:\n\t"\ - "testq %%r11,%%r11;jz 3310512f;"\ - COMPUTE_SIMPLE(1,12)\ - "3310512:\n\t"\ - "movq %%r14,%1; vzeroupper;"\ - :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ - :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ - a_pointer -= M * K; b_pointer += 12 * K; c_pointer += (LDC * 12 - M);\ -} - -#include "common.h" -#include -int __attribute__ ((noinline)) -CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC){ - if(m==0||n==0||k==0||alpha==(float)0.0) return 0; - int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float); - float ALPHA = alpha; - int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; - BLASLONG n_count = n; - float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; - for(;n_count>11;n_count-=12) COMPUTE_n12 - for(;n_count>7;n_count-=8) COMPUTE_NORMAL(8) - for(;n_count>3;n_count-=4) COMPUTE_NORMAL(4) - for(;n_count>1;n_count-=2) COMPUTE_NORMAL(2) - if(n_count>0) COMPUTE_NORMAL(1) - return 0; -} - From 97a32cb0a52b159d547b0c41d42b18854c365ec9 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 22 Feb 2020 23:39:20 +0800 Subject: [PATCH 50/75] Update KERNEL.HASWELL --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index d24b7f3b3..f6ca5c2d5 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -32,7 +32,7 @@ CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c STRMMKERNEL = sgemm_kernel_8x4_haswell.c -SGEMMKERNEL = sgemm_kernel_8x4_haswell.c +SGEMMKERNEL = sgemm_kernel_8x4_haswell_2.c SGEMM_BETA = sgemm_beta_skylakex.c SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c From a2ff577a3005cd5f028705e567b08f6cbd65534c Mon Sep 17 00:00:00 2001 From: wjc404 
<52632443+wjc404@users.noreply.github.com> Date: Sat, 22 Feb 2020 23:39:43 +0800 Subject: [PATCH 51/75] Update KERNEL.ZEN --- kernel/x86_64/KERNEL.ZEN | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index 7cec2e5ed..1cd02db74 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -31,7 +31,7 @@ CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c STRMMKERNEL = sgemm_kernel_8x4_haswell.c -SGEMMKERNEL = sgemm_kernel_8x4_haswell.c +SGEMMKERNEL = sgemm_kernel_8x4_haswell_2.c SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c From 903854c168cc438c7d154fefb25a639752674242 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 22 Feb 2020 23:40:02 +0800 Subject: [PATCH 52/75] Add files via upload --- kernel/x86_64/sgemm_kernel_8x4_haswell_2.c | 424 +++++++++++++++++++++ 1 file changed, 424 insertions(+) create mode 100644 kernel/x86_64/sgemm_kernel_8x4_haswell_2.c diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c new file mode 100644 index 000000000..5ab3e6d1f --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c @@ -0,0 +1,424 @@ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ + +/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#define KERNEL_k1m8n1 \ + "vmovups (%0),%%ymm1; addq $32,%0;"\ + "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m8n2 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" +#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;" +#define KERNEL_h_k1m8n4 \ + KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" +#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define unit_kernel_k1m8n4(c1,c2,c3,c4,boff,...) 
\ + "vbroadcastsd "#boff"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastsd "#boff"+8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" +#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,%1,%%r12,4) +#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" +#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,%1,%%r12,8) +#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;" +#define KERNEL_k2m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,%1)\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,%1)\ + "addq $32,%1;" +#define KERNEL_L_k1m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ + "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "addq $16,%1;" +#define KERNEL_L_k2m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + "vbroadcastsd 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastsd 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastsd 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_L_k1m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ + "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $16,%1;" +#define KERNEL_L_k2m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ + "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps 
%%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ + "vbroadcastss 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 20(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 28(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 20(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_R_k1m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ + "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $16,%1;" +#define KERNEL_R_k2m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ + "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ + "vbroadcastss 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 28(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 20(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 28(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_R_k1m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "addq $16,%1;" +#define 
KERNEL_R_k2m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + "vbroadcastsd 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastsd 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastsd 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define unit_init_m8n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m8n8 unit_init_m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) +#define INIT_m8n4 INIT_m8n8 +#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define INIT_m8n6 INIT_m8n12 +#define INIT_m16n6 INIT_m8n12 +#define SAVE_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" +#define unit_save_m8n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3; vunpcklpd %%ymm3,%%ymm2,"#c1"; vunpckhpd %%ymm3,%%ymm2,"#c2";"\ + "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps (%5,%3,1),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) +#define SAVE_m8n4 "movq %2,%5;"\ + "vaddps %%ymm4,%%ymm8,%%ymm4; vaddps %%ymm5,%%ymm9,%%ymm5; vaddps %%ymm6,%%ymm10,%%ymm6; vaddps %%ymm7,%%ymm11,%%ymm7;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) +#define SAVE_m8n8 "movq %2,%5;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) +#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) +#define unit_save_m16n2(c1,c2,c3,c4) \ + "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps 32(%5),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",32(%5);"\ + "vfmadd213ps (%5,%3,1),%%ymm0,"#c3"; vfmadd213ps 32(%5,%3,1),%%ymm0,"#c4"; vmovups "#c3",(%5,%3,1); vmovups "#c4",32(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_L_m16n6 "movq %2,%5;"\ + unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_R_m16n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ + unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_L_m8n6 "movq %2,%5;"\ + "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ + "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) +#define SAVE_R_m8n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ + "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ + "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ + unit_save_m8n2(%%ymm4,%%ymm5) 
unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) + +/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ +#define KERNEL_k1m4n1 \ + "vmovups (%0),%%xmm1; addq $16,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m4n2 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ + "vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;" +#define KERNEL_h_k1m4n4 \ + KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ + "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ + "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,4) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,8) +#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" +#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" +#define unit_init_m4n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) +#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) +#define SAVE_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" +#define unit_save_m4n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3; vunpcklpd %%xmm3,%%xmm2,"#c1"; vunpckhpd %%xmm3,%%xmm2,"#c2";"\ + "vfmadd213ps (%5),%%xmm0,"#c1"; vmovups "#c1",(%5);"\ + "vfmadd213ps (%5,%3,1),%%xmm0,"#c2"; vmovups "#c2",(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) +#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) +#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) + +/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m2n1 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define SAVE_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" +#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define KERNEL_k1m2n2 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ + "addq $8,%1;" +#define SAVE_m2n2 SAVE_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#define INIT_m2n4 INIT_m2n2 +#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" +#define KERNEL_k1m2n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; 
vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "addq $8,%0;" +#define KERNEL_k1m2n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ + "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ + "addq $8,%0;" +#define KERNEL_k1m2n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ + "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ + "addq $8,%0;" +#define unit_save_m2n4(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) + +/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m1n1 \ + "vmovss (%1),%%xmm3; addq $4,%1;"\ + "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#define INIT_m1n2 INIT_m1n1 +#define KERNEL_k1m1n2 \ + "vmovsd (%1),%%xmm3; addq $8,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n2 \ + "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ + "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#define INIT_m1n4 INIT_m1n2 +#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define KERNEL_k1m1n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define KERNEL_k1m1n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ + "addq $4,%0;" +#define KERNEL_k1m1n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ + "addq $4,%0;" +#define unit_save_m1n4(c1) \ + "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) +#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_m1n12 SAVE_m1n8 
unit_save_m1n4(%%xmm6) + +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ + +#define COMPUTE_SIMPLE(mdim,ndim) \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %4,%4; jz 7"#mdim"7"#ndim"2f;"\ + "7"#mdim"7"#ndim"1:\n\t"\ + KERNEL_k1m##mdim##n##ndim "decq %4; jnz 7"#mdim"7"#ndim"1b;"\ + "7"#mdim"7"#ndim"2:\n\t"\ + SAVE_m##mdim##n##ndim "addq $"#mdim"*4,%2;" +#define COMPUTE_m8n1 COMPUTE_SIMPLE(8,1) +#define COMPUTE_m8n2 COMPUTE_SIMPLE(8,2) +#define COMPUTE_m8n8 COMPUTE_SIMPLE(8,8) +#define COMPUTE_m8n12 COMPUTE_SIMPLE(8,12) +#define COMPUTE_m8n4 \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n4\ + "cmpq $8,%4; jb 78740f;"\ + "78749:\n\t"\ + KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4\ + "subq $8,%4; cmpq $8,%4; jnb 78749b;"\ + "78740:\n\t"\ + "testq %4,%4; jz 78742f;"\ + "78741:\n\t"\ + KERNEL_k1m8n4 "decq %4; jnz 78741b;"\ + "78742:\n\t"\ + SAVE_m8n4 "addq $32,%2;" +#define COMPUTE_L_m16n6 \ + "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ + "movq %%r13,%4; movq %2,%5; cmpq $16,%%r13; jb 7116762f; movq $14,%4;"\ + "7116761:\n\t"\ + KERNEL_L_k2m16n6 "prefetcht0 128(%1); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ + KERNEL_L_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ + KERNEL_L_k2m16n6 "prefetcht0 128(%1); prefetcht1 (%6); cmpq $198,%4; cmoveq %2,%5;"\ + KERNEL_L_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7116761b;"\ + "movq %2,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\ + "7116762:\n\t"\ + "xorq %%r15,%%r15; testq %4,%4; jz 7116764f;"\ + "7116763:\n\t"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ + KERNEL_L_k1m16n6 "cmpq $6,%%r15; cmoveq %2,%5; decq %4; jnz 7116763b;"\ + "7116764:\n\t"\ + SAVE_L_m16n6 "addq $32,%2;" +#define COMPUTE_R_m16n6 \ + "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ + "movq %%r13,%4; leaq (%2,%3,4),%5; leaq (%5,%3,2),%5; movq %5,%%r10; cmpq $16,%%r13; jb 7216762f; movq $14,%4;"\ + "7216761:\n\t"\ + KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ + KERNEL_R_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ + KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); prefetcht1 (%6); cmpq $198,%4; cmoveq %%r10,%5;"\ + KERNEL_R_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7216761b;"\ + "movq %%r10,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\ + "7216762:\n\t"\ + "xorq %%r15,%%r15; testq %4,%4; jz 7216764f;"\ + "7216763:\n\t"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ + KERNEL_R_k1m16n6 "cmpq $6,%%r15; cmoveq %%r10,%5; decq %4; jnz 7216763b;"\ + "7216764:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_R_m16n6 "addq $32,%2;" +#define COMPUTE_H_m8n6 \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ + "cmpq $8,%4; jb 718760f; movq %2,%5; xorq %%r15,%%r15;"\ + "718769:\n\t"\ + KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "cmpq $62,%%r15; movq $62,%%r15; cmoveq %3,%%r15;"\ + KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "prefetcht2 (%5); leaq -31(%5,%%r15,1),%5;"\ + "subq $8,%4; cmpq $8,%4; jnb 718769b;"\ + "718760:\n\t"\ + "testq %4,%4; jz 718762f;"\ + "718761:\n\t"\ + KERNEL_L_k1m8n6 "decq %4; jnz 718761b;"\ + "718762:\n\t"\ + SAVE_L_m8n6 "negq %%r12; leaq (%0,%%r12,8),%0; negq %%r12;" +#define COMPUTE_T_m8n6(side,sim) \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ + "cmpq $8,%4; jb 
72"#sim"8760f;"\ + "72"#sim"8769:\n\t"\ + KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6\ + "subq $8,%4; cmpq $8,%4; jnb 72"#sim"8769b;"\ + "72"#sim"8760:\n\t"\ + "testq %4,%4; jz 72"#sim"8762f;"\ + "72"#sim"8761:\n\t"\ + KERNEL_##side##_k1m8n6 "decq %4; jnz 72"#sim"8761b;"\ + "72"#sim"8762:\n\t"\ + SAVE_##side##_m8n6 "addq $32,%2;" +#define COMPUTE_NORMAL(ndim) {\ + next_b = b_pointer + ndim * K;\ + __asm__ __volatile__(\ + "vbroadcastss %9,%%ymm0;"\ + "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $8,%%r11;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m8n##ndim\ + "subq $8,%%r11;cmpq $8,%%r11;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $4,%%r11;jb 33103"#ndim"f;"\ + COMPUTE_SIMPLE(4,ndim) "subq $4,%%r11;"\ + "33103"#ndim":\n\t"\ + "cmpq $2,%%r11;jb 33104"#ndim"f;"\ + COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\ + "33104"#ndim":\n\t"\ + "testq %%r11,%%r11;jz 33105"#ndim"f;"\ + COMPUTE_SIMPLE(1,ndim)\ + "33105"#ndim":\n\t"\ + "movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ + :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += (LDC * ndim - M);\ +} +#define COMPUTE_n12 {\ + next_b = b_pointer + 12 * K;\ + __asm__ __volatile__(\ + "vbroadcastss %9,%%ymm0;"\ + "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $16,%%r11;jb 3310112f;"\ + COMPUTE_H_m8n6\ + "3310612:\n\t"\ + COMPUTE_R_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jb 3310712f;"\ + COMPUTE_L_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jnb 3310612b;"\ + COMPUTE_T_m8n6(R,5) "subq $8,%%r11; jmp 3310212f;"\ + "3310712:\n\t"\ + COMPUTE_T_m8n6(L,7) "subq $8,%%r11; jmp 3310212f;"\ + "3310112:\n\t"\ + "cmpq $8,%%r11;jb 3310212f;"\ + COMPUTE_SIMPLE(8,12) "subq $8,%%r11;"\ + "3310212:\n\t"\ + "cmpq $4,%%r11;jb 3310312f;"\ + COMPUTE_SIMPLE(4,12) "subq $4,%%r11;"\ + "3310312:\n\t"\ + "cmpq $2,%%r11;jb 3310412f;"\ + COMPUTE_SIMPLE(2,12) "subq $2,%%r11;"\ + "3310412:\n\t"\ + "testq %%r11,%%r11;jz 3310512f;"\ + COMPUTE_SIMPLE(1,12)\ + "3310512:\n\t"\ + "movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ + :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + a_pointer -= M * K; b_pointer += 12 * K; c_pointer += (LDC * 12 - M);\ +} + +#include "common.h" +#include +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC){ + if(m==0||n==0||k==0||alpha==(float)0.0) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float); + float ALPHA = alpha; + int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; + BLASLONG n_count = n; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; + for(;n_count>11;n_count-=12) COMPUTE_n12 + for(;n_count>7;n_count-=8) COMPUTE_NORMAL(8) + for(;n_count>3;n_count-=4) COMPUTE_NORMAL(4) + for(;n_count>1;n_count-=2) COMPUTE_NORMAL(2) + if(n_count>0) COMPUTE_NORMAL(1) + return 0; +} + From 9e40c080f2c820d9e899182a8f8cb9b7d400bc55 Mon Sep 17 00:00:00 2001 From: Martin Kroeker 
Date: Sun, 23 Feb 2020 22:39:01 +0100 Subject: [PATCH 53/75] Apply fix from Reference-LAPACK PR390, NaN not propagating --- lapack-netlib/SRC/dcombssq.f | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lapack-netlib/SRC/dcombssq.f b/lapack-netlib/SRC/dcombssq.f index 79f6d95c9..7a1ddd1af 100644 --- a/lapack-netlib/SRC/dcombssq.f +++ b/lapack-netlib/SRC/dcombssq.f @@ -80,6 +80,8 @@ IF( V1( 1 ).GE.V2( 1 ) ) THEN IF( V1( 1 ).NE.ZERO ) THEN V1( 2 ) = V1( 2 ) + ( V2( 1 ) / V1( 1 ) )**2 * V2( 2 ) + ELSE + V1( 2 ) = V1( 2 ) + V2( 2 ) END IF ELSE V1( 2 ) = V2( 2 ) + ( V1( 1 ) / V2( 1 ) )**2 * V1( 2 ) From 87ac1ceb0baaaeec38fc108d537dd8ad5a3b679f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Feb 2020 22:40:40 +0100 Subject: [PATCH 54/75] Apply fix from Reference-LAPACK PR390, NaN not propagating --- lapack-netlib/SRC/scombssq.f | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lapack-netlib/SRC/scombssq.f b/lapack-netlib/SRC/scombssq.f index 76bc0e320..cc51a324b 100644 --- a/lapack-netlib/SRC/scombssq.f +++ b/lapack-netlib/SRC/scombssq.f @@ -80,6 +80,8 @@ IF( V1( 1 ).GE.V2( 1 ) ) THEN IF( V1( 1 ).NE.ZERO ) THEN V1( 2 ) = V1( 2 ) + ( V2( 1 ) / V1( 1 ) )**2 * V2( 2 ) + ELSE + V1( 2 ) = V1( 2 ) + V2( 2 ) END IF ELSE V1( 2 ) = V2( 2 ) + ( V1( 1 ) / V2( 1 ) )**2 * V1( 2 ) From c93ae92579c8b67908b4324f562332d76ebb6b75 Mon Sep 17 00:00:00 2001 From: wuanjun 00447568 Date: Mon, 24 Feb 2020 11:23:39 +0800 Subject: [PATCH 55/75] =?UTF-8?q?[OpenBlas]:benchmark/copy.c=20has=20time?= =?UTF-8?q?=EF=BC=8Cx,y=20data=20loop=20problems?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- benchmark/copy.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/benchmark/copy.c b/benchmark/copy.c index ea5b38d68..d7f58c94f 100644 --- a/benchmark/copy.c +++ b/benchmark/copy.c @@ -129,7 +129,10 @@ int main(int argc, char *argv[]){ int step = 1; struct timeval start, stop; - double time1,timeg; + double time1 = 0.0, timeg = 0.0; + long nanos = 0; + time_t seconds = 0; + struct timespec time_start = { 0, 0 }, time_end = { 0, 0 }; argc--;argv++; @@ -163,35 +166,32 @@ int main(int argc, char *argv[]){ timeg=0; fprintf(stderr, " %6d : ", (int)m); + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } for (l=0; l Date: Mon, 24 Feb 2020 19:20:00 +0100 Subject: [PATCH 56/75] Add DYNAMIC_ARCH support for ARMV8 EMAG8180 --- driver/others/dynamic_arm64.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 72f5fcca2..d0664ebcb 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -51,10 +51,11 @@ extern gotoblas_t gotoblas_FALKOR; extern gotoblas_t gotoblas_THUNDERX; extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; +extern gotoblas_t gotoblas_EMAG8180; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 9 +#define NUM_CORETYPES 10 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -78,6 +79,7 @@ static char *corename[] = { "thunderx", "thunderx2t99", "tsv110", + "emag8180", "unknown" }; @@ -91,6 +93,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_THUNDERX) return corename[ 6]; if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7]; if 
(gotoblas == &gotoblas_TSV110) return corename[ 8]; + if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; return corename[NUM_CORETYPES]; } @@ -119,6 +122,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 6: return (&gotoblas_THUNDERX); case 7: return (&gotoblas_THUNDERX2T99); case 8: return (&gotoblas_TSV110); + case 9: return (&gotoblas_EMAG8180); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -189,6 +193,13 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_TSV110; } break; + case 0x50: // Ampere + switch (part) + { + case 0x000: // Skylark/EMAG8180 + return &gotoblas_EMAG818; + } + break; case 0x51: // Qualcomm switch (part) { From 320e2648cd90e14c77e015e3cb912e37ce4c0b94 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 24 Feb 2020 19:23:46 +0100 Subject: [PATCH 57/75] Add EMAG8180 to DYNAMIC_CORE list for ARM64 --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index cf9e9bafa..65027bd20 100644 --- a/Makefile.system +++ b/Makefile.system @@ -558,6 +558,7 @@ DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 +DYNAMIC_CORE += EMAG8180 endif ifeq ($(ARCH), zarch) From 4c5fac5a2bad43ef6ffb333fc9a901e163588de1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 24 Feb 2020 20:15:04 +0100 Subject: [PATCH 58/75] Typo fix --- driver/others/dynamic_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index d0664ebcb..9f42ce4c6 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -197,7 +197,7 @@ static gotoblas_t *get_coretype(void) { switch (part) { case 0x000: // Skylark/EMAG8180 - return &gotoblas_EMAG818; + return &gotoblas_EMAG8180; } break; case 0x51: // Qualcomm From 1ddf9f1067a7abb247ff659d4bda54ea62aeb758 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 24 Feb 2020 20:16:18 +0100 Subject: [PATCH 59/75] Add EMAG8180 to arm64 DYNAMIC_ARCH list for cmake --- cmake/arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 8280d6274..d31961c14 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,7 +45,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180) endif () if (POWER) From ca4f7dceff13fbdc2fb48d17641d4065d3a4edab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 24 Feb 2020 20:23:18 +0100 Subject: [PATCH 60/75] Add parameters for EMAG8180 DYNAMIC_ARCH support with cmake --- cmake/prebuild.cmake | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index c6d109356..b74a0699b 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -332,6 +332,29 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "EMAG8180") + file(APPEND ${TARGET_CONF_TEMP} + "#define ARMV8\n" + "#define L1_CODE_SIZE\t32768\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t5262144\n" + "#define 
L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "POWER6") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" From f8ec538c82e5c5b9e363d4064360c6699a956f79 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 25 Feb 2020 14:30:00 +0100 Subject: [PATCH 61/75] Add Ampere EMAG8180 --- getarch.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/getarch.c b/getarch.c index 1f590390a..c1003c2d1 100644 --- a/getarch.c +++ b/getarch.c @@ -1093,6 +1093,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_EMAG8180 +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "EMAG8180" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DEMAG8180 " \ + "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "emag8180" +#define CORENAME "EMAG8180" +#endif #ifdef FORCE_ZARCH_GENERIC #define FORCE From 2515e1152f278f2f543156162de62d69213c9088 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 26 Feb 2020 18:36:54 +0800 Subject: [PATCH 62/75] Update cgemm_kernel_8x2_haswell.c --- kernel/x86_64/cgemm_kernel_8x2_haswell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.c b/kernel/x86_64/cgemm_kernel_8x2_haswell.c index eab8c9ea5..5d3bd599a 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.c +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.c @@ -50,7 +50,7 @@ "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1) #define KERNEL_2_k1m8n4 \ - "vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\ + "vpermilps $177,-64(%0),%%ymm0; vpermilps $177,-32(%0),%%ymm1;"\ acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1) #define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2) #define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2) From 1b980001dda2af41c470856f65fee34c09d7ad11 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 26 Feb 2020 18:38:12 +0800 Subject: [PATCH 63/75] Update zgemm_kernel_4x2_haswell.c --- kernel/x86_64/zgemm_kernel_4x2_haswell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.c b/kernel/x86_64/zgemm_kernel_4x2_haswell.c index 3279b8b8c..e3bd7897a 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.c +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.c @@ -50,7 +50,7 @@ "vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1) #define KERNEL_2_k1m4n4 \ - "vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\ + "vpermilpd $5,-64(%0),%%ymm0; vpermilpd $5,-32(%0),%%ymm1;"\ 
acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1) #define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2) #define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2) From 8164fd13281e824260346d558e4f5408296d753b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Feb 2020 22:19:57 +0100 Subject: [PATCH 64/75] Always assume server-class cpu count for TSV110 and EMAG8180 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 055749dc1..2b7b4a050 100644 --- a/param.h +++ b/param.h @@ -2620,7 +2620,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*FIXME: this should be using the cache size, but there is currently no easy way to query that on ARM. So if getarch counted more than 8 cores we simply assume the host is a big desktop or server with abundant cache rather than a phone or embedded device */ -#if NUM_CORES > 8 +#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 From 2352331e60a8e0e83baba65da9349734f5edc49b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 27 Feb 2020 22:25:19 +0800 Subject: [PATCH 65/75] Update zgemm_kernel_4x2_haswell.c --- kernel/x86_64/zgemm_kernel_4x2_haswell.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.c b/kernel/x86_64/zgemm_kernel_4x2_haswell.c index e3bd7897a..917a3fd48 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.c +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.c @@ -93,9 +93,9 @@ "movq $10,%5; movq $84,%%r15;"\ #ndim"4441:\n\t"\ "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ - "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ - "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\ "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\ #ndim"4442:\n\t"\ From dd22eb7621a5fb7d4b65e39ae59679aadb0b3767 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 27 Feb 2020 22:26:15 +0800 Subject: [PATCH 66/75] Update cgemm_kernel_8x2_haswell.c --- kernel/x86_64/cgemm_kernel_8x2_haswell.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.c b/kernel/x86_64/cgemm_kernel_8x2_haswell.c index 5d3bd599a..08882346d 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.c +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.c @@ -93,7 +93,6 @@ "movq $10,%5; movq $84,%%r15;"\ #ndim"8881:\n\t"\ "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ - "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ From a66f4d80c8b39c7d7949f0702c238ab86c690a15 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Feb 2020 23:09:40 +0100 Subject: [PATCH 67/75] Apply MinGW AVX512 compilation fix to fortran options as well original issue was #1708, I see now that the same problem affects gfortran compilation. 
The underlying issue is said to be fixed (but not yet released) on all branches of gcc as of a few days ago but it will certainly take time to reach mingw/msys. --- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 99364752f..f2de51ef4 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -15,10 +15,12 @@ CCOMMON_OPT += -march=skylake-avx512 FCOMMON_OPT += -march=skylake-avx512 ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables endif ifeq ($(OSNAME), WINNT) ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables endif endif endif From e1062400c4ea7b4cad65b9df8a30fac4224f9737 Mon Sep 17 00:00:00 2001 From: j00520245 Date: Fri, 28 Feb 2020 16:36:53 +0800 Subject: [PATCH 68/75] New add syr benchmark --- benchmark/Makefile | 49 +++++++++++- benchmark/syr.c | 187 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 2 deletions(-) create mode 100644 benchmark/syr.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 51e9c64aa..1d4a220e4 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -12,9 +12,9 @@ include $(TOPDIR)/Makefile.system # ACML 6.1 custom ACML=/home/saar/acml6.1/gfortran64_mp/lib LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm - -# Atlas Ubuntu + +# Atlas Ubuntu #ATLAS=/usr/lib/atlas-base #LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm @@ -56,6 +56,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + ssyr.goto dsyr.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -83,6 +84,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + ssyr.acml dsyr.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -109,6 +111,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + ssyr.goto dsyr.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -136,6 +139,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + ssyr.mkl dsyr.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -162,6 +166,7 @@ else goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + ssyr.goto dsyr.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -188,6 +193,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ 
sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + ssyr.acml dsyr.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -214,6 +220,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + ssyr.atlas dsyr.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -243,6 +250,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + ssyr.mkl dsyr.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -280,6 +288,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ + ssyr.veclib dsyr.veclib \ ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ @@ -768,6 +777,36 @@ ztrsm.veclib : ztrsm.$(SUFFIX) ztrsm.essl : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssyr #################################################### +ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr.acml : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.atlas : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.mkl : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.veclib : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr #################################################### +dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr.acml : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.atlas : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.mkl : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.veclib : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) @@ -2078,6 +2117,12 @@ ctrsm.$(SUFFIX) : trsm.c ztrsm.$(SUFFIX) : trsm.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +ssyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + ssyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/syr.c b/benchmark/syr.c new file mode 100644 
index 000000000..91b5b5904 --- /dev/null +++ b/benchmark/syr.c @@ -0,0 +1,187 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SYR + +#ifdef DOUBLE +#define SYR BLASFUNC(dsyr) +#else +#define SYR BLASFUNC(ssyr) +#endif + + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x,*a; + FLOAT alpha[] = {1.0, 1.0}; + char *p; + + char uplo='U'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + blasint m, i, j; + blasint inc_x= 1; + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if 
(argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d\n", from, to, step,uplo,inc_x); + + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + fprintf(stderr, " %6d : ", (int)m); + + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + gettimeofday( &start, (struct timezone *)0); + + SYR (&uplo, &m, alpha, x, &inc_x, a, &m ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + gettimeofday( &start, (struct timezone *)0); + + fprintf(stderr, + " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); + + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From c623a965f95b83bd4340a5ebe1a370b1a900545d Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Fri, 21 Feb 2020 22:46:58 +0000 Subject: [PATCH 69/75] Add Neoverse-N1 core The implementation is a hybrid of the ARMV8 one with some of the improved TX2 routines, along with specifying -march=armv8.2-a --- Makefile.arm64 | 17 +++ Makefile.system | 2 + TargetList.txt | 1 + cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 27 +++++ cpuid_arm64.c | 23 +++- driver/others/dynamic_arm64.c | 8 +- getarch.c | 18 ++++ kernel/arm64/KERNEL.NEOVERSEN1 | 189 +++++++++++++++++++++++++++++++++ param.h | 29 +++++ 10 files changed, 312 insertions(+), 4 deletions(-) create mode 100644 kernel/arm64/KERNEL.NEOVERSEN1 diff --git a/Makefile.arm64 b/Makefile.arm64 index c17ea7938..a7cd82e3a 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -24,6 +24,23 @@ CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 endif +# Use a72 tunings because Neoverse-N1 is only available +# in GCC>=9 +ifeq ($(CORE), NEOVERSEN1) +ifeq ($(GCCVERSIONGTEQ7), 1) +ifeq ($(GCCVERSIONGTEQ9), 1) +CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif + ifeq ($(CORE), THUNDERX) CCOMMON_OPT += -march=armv8-a -mtune=thunderx FCOMMON_OPT += -march=armv8-a -mtune=thunderx diff --git a/Makefile.system b/Makefile.system index a928b6e25..1e30d05a8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -328,6 +328,7 @@ ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) +GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) 
-dumpversion | cut -f1 -d.` \>= 9) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) @@ -554,6 +555,7 @@ DYNAMIC_CORE += CORTEXA53 DYNAMIC_CORE += CORTEXA57 DYNAMIC_CORE += CORTEXA72 DYNAMIC_CORE += CORTEXA73 +DYNAMIC_CORE += NEOVERSEN1 DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 diff --git a/TargetList.txt b/TargetList.txt index 6a57bf1af..5b31df045 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -88,6 +88,7 @@ CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 +NEOVERSEN1 FALKOR THUNDERX THUNDERX2T99 diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d31961c14..9d51f777c 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,7 +45,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) endif () if (POWER) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index b74a0699b..44e1473d1 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -229,6 +229,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEN1") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t16\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "FALKOR") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 5868af75c..4103216e6 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -34,6 +34,7 @@ #define CPU_CORTEXA57 3 #define CPU_CORTEXA72 4 #define CPU_CORTEXA73 5 +#define CPU_NEOVERSEN1 11 // Qualcomm #define CPU_FALKOR 6 // Cavium @@ -55,7 +56,8 @@ static char *cpuname[] = { "THUNDERX", "THUNDERX2T99", "TSV110", - "EMAG8180" + "EMAG8180", + "NEOVERSEN1" }; static char *cpuname_lower[] = { @@ -69,7 +71,8 @@ static char *cpuname_lower[] = { "thunderx", "thunderx2t99", "tsv110", - "emag8180" + "emag8180", + "neoversen1" }; int get_feature(char *search) @@ -144,6 +147,8 @@ int detect(void) return CPU_CORTEXA72; else if (strstr(cpu_part, "0xd09")) return CPU_CORTEXA73; + else if (strstr(cpu_part, "0xd0c")) + return CPU_NEOVERSEN1; } // Qualcomm else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) @@ -285,6 +290,20 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); break; + case CPU_NEOVERSEN1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define 
L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; case CPU_FALKOR: printf("#define FALKOR\n"); diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 9f42ce4c6..11ef2725c 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -52,10 +52,11 @@ extern gotoblas_t gotoblas_THUNDERX; extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; +extern gotoblas_t gotoblas_NEOVERSEN1; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 10 +#define NUM_CORETYPES 11 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -80,6 +81,7 @@ static char *corename[] = { "thunderx2t99", "tsv110", "emag8180", + "neoversen1", "unknown" }; @@ -94,6 +96,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7]; if (gotoblas == &gotoblas_TSV110) return corename[ 8]; if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; + if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; return corename[NUM_CORETYPES]; } @@ -123,6 +126,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 7: return (&gotoblas_THUNDERX2T99); case 8: return (&gotoblas_TSV110); case 9: return (&gotoblas_EMAG8180); + case 10: return (&gotoblas_NEOVERSEN1); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -168,6 +172,8 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_CORTEXA72; case 0xd09: // Cortex A73 return &gotoblas_CORTEXA73; + case 0xd0c: // Neoverse N1 + return &gotoblas_NEOVERSEN1; } break; case 0x42: // Broadcom diff --git a/getarch.c b/getarch.c index d29f6369c..30ca290e3 100644 --- a/getarch.c +++ b/getarch.c @@ -1028,6 +1028,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_NEOVERSEN1 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEN1" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEN1 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" \ + "-march=armv8.2-a -mtune=cortex-a72" +#define LIBNAME "neoversen1" +#define CORENAME "NEOVERSEN1" +#else +#endif + + #ifdef FORCE_FALKOR #define FORCE #define ARCHITECTURE "ARM64" diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = 
zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/param.h b/param.h index 2b7b4a050..726639a4a 100644 --- a/param.h +++ b/param.h @@ -2705,6 +2705,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(NEOVERSEN1) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define 
SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #else // Other/undetected ARMv8 cores #define SGEMM_DEFAULT_UNROLL_M 16 From 19f3a4091c41ec50b1f956e916680241cf202c91 Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Sat, 22 Feb 2020 05:07:55 +0000 Subject: [PATCH 70/75] Make rpcc() on arm64 get closer to what x86 returns The Arm implementation of rpcc() uses the architected timer which is defined by the SBSA to be between 10-400MHz. These numbers are much smaller than the cycle counter frequency used by x86. Make the numbers closer by shifting the cycle counter up by the number of leading zeros in the cntfrq_el0 register, which gets us closer to a normal cpu clock cycle range. --- common_arm64.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common_arm64.h b/common_arm64.h index 5951e1ee5..66a1d1dc4 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -81,10 +81,12 @@ static void __inline blas_lock(volatile BLASULONG *address){ #if !defined(OS_DARWIN) && !defined (OS_ANDROID) static __inline BLASULONG rpcc(void){ BLASULONG ret = 0; + blasint shift; __asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret)); + __asm__ __volatile__ ("mrs %0,cntfrq_el0; clz %w0, %w0":"=&r"(shift)); - return ret; + return ret << shift; } #define RPCC_DEFINED From 0af9991cc9d0d3696847eaaaa8fa4288deea9146 Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Fri, 21 Feb 2020 23:43:43 +0000 Subject: [PATCH 71/75] Use wait-for-event to not spin in the blas_lock --- common_arm64.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index 5951e1ee5..52f7451d3 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -53,16 +53,16 @@ static void __inline blas_lock(volatile BLASULONG *address){ BLASULONG ret; do { - while (*address) {YIELDING;}; - __asm__ __volatile__( "mov x4, #1 \n\t" + "sevl \n\t" "1: \n\t" + "wfe \n\t" + "2: \n\t" "ldaxr x2, [%1] \n\t" "cbnz x2, 1b \n\t" - "2: \n\t" "stxr w3, x4, [%1] \n\t" - "cbnz w3, 1b \n\t" + "cbnz w3, 2b \n\t" "mov %0, #0 \n\t" : "=r"(ret), "=r"(address) : "1"(address) From 97ce6bbce2580d1a3d8c9844afcac431d749abdc Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Sat, 29 Feb 2020 17:27:18 +0000 Subject: [PATCH 72/75] Fix barriers in level3_thread --- driver/level3/level3_thread.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index bf558447e..ca0085e71 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -351,8 +351,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using workspace */ START_RPCC(); for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; STOP_RPCC(waiting1); + MB; #if defined(FUSED_GEMM) && !defined(TIMING) @@ -395,10 +396,10 @@ static int 
inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif + WMB; /* Set flag so other threads can access local region of B */ for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - WMB; } /* Get regions of B from other threads and apply kernel */ @@ -417,8 +418,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Wait until other region of B is initialized */ START_RPCC(); - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; STOP_RPCC(waiting2); + MB; /* Apply kernel with local region of A and part of other region of B */ START_RPCC(); @@ -434,8 +436,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; } } } while (current != mypos); @@ -477,8 +479,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with region of B */ if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; } } @@ -497,10 +499,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); for (i = 0; i < args -> nthreads; i++) { for (js = 0; js < DIVIDE_RATE; js++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; } } STOP_RPCC(waiting3); + MB; #ifdef TIMING BLASLONG waiting = waiting1 + waiting2 + waiting3; @@ -705,7 +708,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); } } } - + WMB; /* Execute parallel computation */ exec_blas(nthreads, queue); } From 4f371b0fbf8219270e37cb7827850ac13c8686d5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Mar 2020 23:45:58 +0100 Subject: [PATCH 73/75] Use POWER8 kernels on big-endian POWER9 for now --- kernel/power/KERNEL.POWER9 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 4bfa017e1..aabb5d976 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -1,3 +1,7 @@ +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +include $(KERNELDIR)/KERNEL.POWER8 +else + #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -206,3 +210,5 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +endif From f14013da7fda056a2ee42ccf88f14b46b91686ef Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Mar 2020 00:01:22 +0100 Subject: [PATCH 74/75] Update with 0.3.9 changes --- Changelog.txt | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index d66b2719a..5f924629b 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,48 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.9 + 1-Mar-2020 + + common: + * Fixed a miscompilation of the GETRF functions with 
CMAKE + * Imported bugfix 390 from LAPACK (missing NaN propagation in xCOMBSSQ) + * The size of the memory buffer used for splitting GEMM tasks across + multiple threads can now be configured in the build system. + +POWER: + * Fixed several compilation problems related to endianness + and ELF version on POWER8 and POWER9 + * Fixed use of the absolute value IAMIN/IAMAX instead of IMIN/IMAX + * Fixed a race condition in the level3 blas code + +MIPS64: + * Fixed use of the absoltute value IAMIN/IAMAX instead of IMIN/IMAX + +ARMV7: + * Fixed a race condition in the level3 blas code + * Fixed compilation on Android +ARMV8: + * Added support for Ampere EMAG8180 + * Added support for Neoverse N1 + * Improved performance of the blas_lock function + * Fixed a race condition in the level3 blas code + * Fixed a performance regression on TSV110-based servers + +x86_64: + * Fixed a long-standing error with undeclared register overwrites + in the DSCAL microkernel for HASWELL,SKYLAKEX and ZEN + * Fixed a long-standing bug in the SSE implementation of IAMAX + * Fixed a CMAKE build failure with DYNAMIC_ARCH + * Fixed cpu autodetection of Goldmont+, Cannon Lake and Ice Lake + * Fixed a compilation failure on OSX with compiler name containing dash + * Fixed compilation with MinGW on SkylakeX + * Improved speed of the AVX512 GEMM3M kernel on SkylakeX + * Added an AVX512 STRMM kernel for SkylakeX + * Improved GEMM performance on Haswell and Zen + +zarch: + * fixed compilation of the DYNAMIC_ARCH code + ==================================================================== Version 0.3.8 9-Feb-2020 From d221c50f2741b31b83e3cbcc005977cb0fe47bc3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Mar 2020 00:02:36 +0100 Subject: [PATCH 75/75] Add Ampere EMAG8180 --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index 5b31df045..f4a40ed02 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -89,6 +89,7 @@ CORTEXA57 CORTEXA72 CORTEXA73 NEOVERSEN1 +EMAG8180 FALKOR THUNDERX THUNDERX2T99
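For reference, the logic changed by [PATCH 53/75] and [PATCH 54/75] can be checked in isolation: the added ELSE branch makes xCOMBSSQ keep the second sum-of-squares contribution when the first scale is zero instead of silently dropping it, which is how a NaN carried in V2(2) could be lost. The stand-alone C transcription below is only an illustration; the ssq_t struct, the function name and the chosen inputs (an empty accumulator combined with a pair whose sum already holds a NaN) are invented for the example and are not part of the patch series.

#include <math.h>
#include <stdio.h>

/* each pair represents a sum of squares as scale^2 * ssq (LASSQ convention) */
typedef struct { double scale, ssq; } ssq_t;

static void combssq_sketch(ssq_t *v1, const ssq_t *v2, int with_fix) {
    if (v1->scale >= v2->scale) {
        if (v1->scale != 0.0) {
            double r = v2->scale / v1->scale;
            v1->ssq += r * r * v2->ssq;
        } else if (with_fix) {
            v1->ssq += v2->ssq;   /* the added ELSE branch: keep v2's sum, NaN included */
        }                         /* without the fix, v2->ssq was discarded here        */
    } else {
        double r = v1->scale / v2->scale;
        v1->ssq = v2->ssq + r * r * v1->ssq;
        v1->scale = v2->scale;
    }
}

int main(void) {
    ssq_t v2 = { 0.0, NAN };                       /* partial result carrying a NaN */
    ssq_t v_old = { 0.0, 0.0 }, v_fixed = { 0.0, 0.0 };
    combssq_sketch(&v_old, &v2, 0);
    combssq_sketch(&v_fixed, &v2, 1);
    printf("old combine: %g   fixed combine: %g\n", v_old.ssq, v_fixed.ssq);
    return 0;
}

Run as shown, the old path prints 0 while the fixed path prints nan, which is the propagation behaviour the two patches restore.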
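For reference, the DYNAMIC_ARCH dispatch that [PATCH 56/75] and [PATCH 69/75] extend keys off the CPU identification register: the implementer field selects a vendor and the part number selects a gotoblas_* kernel table (0x41/0xd0c for Neoverse N1, 0x50/0x000 for the Ampere Skylark/EMAG8180). The helper below is a minimal sketch of that decoding, not the dispatcher's actual code; the function name and the sample register values are invented for the illustration, and only core IDs visible in these patches are listed.

#include <stdint.h>
#include <stdio.h>

/* MIDR_EL1 layout: implementer in bits [31:24], part number in bits [15:4] */
static const char *core_name_sketch(uint32_t midr) {
    unsigned implementer = (midr >> 24) & 0xff;
    unsigned part        = (midr >> 4)  & 0xfff;

    switch (implementer) {
    case 0x41:                                 /* ARM                */
        if (part == 0xd09) return "cortexa73";
        if (part == 0xd0c) return "neoversen1";
        break;
    case 0x50:                                 /* Ampere             */
        if (part == 0x000) return "emag8180";  /* Skylark / EMAG8180 */
        break;
    }
    return "armv8";                            /* generic fallback   */
}

int main(void) {
    /* hypothetical register values, not read from real hardware */
    uint32_t samples[] = { 0x413fd0c1u, 0x500f0000u };
    for (int i = 0; i < 2; i++)
        printf("midr=0x%08x -> %s\n", (unsigned)samples[i], core_name_sketch(samples[i]));
    return 0;
}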
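The arithmetic behind [PATCH 70/75] can also be checked in isolation: shifting the tick count left by the number of leading zeros of the 32-bit cntfrq_el0 value normalizes any SBSA-range timer frequency into roughly the 2.1 to 4.3 GHz band, the same order of magnitude as an x86 TSC. The program below is a stand-alone illustration of that scaling; the listed frequencies are hypothetical examples and __builtin_clz stands in for the clz instruction used in the patch.

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* hypothetical cntfrq_el0 values in Hz */
    uint32_t freqs[] = { 10000000u, 25000000u, 100000000u, 400000000u };

    for (int i = 0; i < 4; i++) {
        int shift = __builtin_clz(freqs[i]);            /* leading zeros of the 32-bit frequency */
        double effective = (double)freqs[i] * (double)(1u << shift);
        printf("cntfrq = %9u Hz  shift = %2d  effective rate ~ %.2f GHz\n",
               (unsigned)freqs[i], shift, effective / 1e9);  /* always lands in 2.1-4.3 GHz */
    }
    return 0;
}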
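[PATCH 71/75] restructures blas_lock so a waiting core parks in wfe until the lock word is written, instead of spinning on a plain load. A rough C11-atomics rendering of the resulting acquire loop is sketched below for illustration only; the function names and the unlock helper are invented for the example, and the sevl/wfe sleep has no portable equivalent here, so the inner loop simply spins.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long lock_word;   /* stands in for the BLASULONG lock word */

static void blas_lock_sketch(_Atomic unsigned long *address) {
    unsigned long expected;
    do {
        /* wait until the word looks free; the asm version parks in wfe here */
        while (atomic_load_explicit(address, memory_order_relaxed) != 0)
            ;
        expected = 0;
        /* the ldaxr/stxr pair behaves like an acquire compare-and-swap */
    } while (!atomic_compare_exchange_weak_explicit(address, &expected, 1UL,
                                                    memory_order_acquire,
                                                    memory_order_relaxed));
}

static void blas_unlock_sketch(_Atomic unsigned long *address) {
    /* in the asm version, the store that frees the lock is what wakes wfe waiters */
    atomic_store_explicit(address, 0UL, memory_order_release);
}

int main(void) {
    blas_lock_sketch(&lock_word);
    puts("lock taken");
    blas_unlock_sketch(&lock_word);
    puts("lock released");
    return 0;
}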
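[PATCH 72/75] reorders the barriers in level3_thread.c into a conventional publish/consume pattern: WMB before the store that publishes a buffer in job[].working, and MB after the loop that waits on that flag. The toy program below demonstrates the same ordering with C11 atomics and pthreads; the buffer, the flag and the thread bodies are simplified stand-ins invented for this example, not the OpenBLAS data structures.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static double buffer[4];                 /* stands in for the copied panel of B       */
static atomic_long working;              /* stands in for job[mypos].working[i][slot] */

static void *producer(void *arg) {
    (void)arg;
    for (int i = 0; i < 4; i++)
        buffer[i] = i + 0.5;             /* fill the shared workspace                 */
    /* release store: "WMB, then set the flag" in the patched code */
    atomic_store_explicit(&working, 1, memory_order_release);
    return NULL;
}

static void *consumer(void *arg) {
    (void)arg;
    /* acquire loop: "spin on the flag, then MB" in the patched code */
    while (atomic_load_explicit(&working, memory_order_acquire) == 0)
        ;                                /* YIELDING in the real code                 */
    double s = 0.0;
    for (int i = 0; i < 4; i++)
        s += buffer[i];                  /* reads are ordered after seeing the flag   */
    printf("consumer saw sum = %g\n", s);
    return NULL;
}

int main(void) {
    pthread_t p, c;
    pthread_create(&c, NULL, consumer, NULL);
    pthread_create(&p, NULL, producer, NULL);
    pthread_join(p, NULL);
    pthread_join(c, NULL);
    return 0;
}

With release/acquire ordering (or the explicit WMB/MB placement in the patch), the consumer is guaranteed to see the filled buffer once it observes the flag, which is the race the patch closes.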