From 2fc712469d1e29220e2e3f3f83d2ab7b17c0bc60 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Dec 2018 13:56:06 +0100 Subject: [PATCH 01/23] Avoid creating spurious non-suffixed c/zgemm_kernels Plain cgemm_kernel and zgemm_kernel are not used anywhere, only cgemm_kernel_b etc. Needlessly building them (without any define like NN, CN, etc.) just happened to work on most platforms, but not on arm64. See #1870 --- kernel/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 947114ebe..2a330df4e 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -125,10 +125,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) set(USE_TRMM true) endif () - foreach (float_type ${FLOAT_TYPES}) + foreach (float_type SINGLE DOUBLE) string(SUBSTRING ${float_type} 0 1 float_char) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) + endforeach() + foreach (float_type ${FLOAT_TYPES}) + string(SUBSTRING ${float_type} 0 1 float_char) if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () From 7639f2e1f004d441757a43bcdfff6c32611a2aa3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Dec 2018 14:04:27 +0100 Subject: [PATCH 02/23] Rewrite the conditional for OSX to fix cmake parsing on others The Makefile variable parser in utils.cmake currently does not handle conditionals. Having the definitions for non-OSX last will at least make cmake builds work again on non-OSX platforms. --- kernel/arm64/KERNEL.ARMV8 | 63 +++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 07d6cee99..a2a435738 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -104,8 +104,38 @@ CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S -ifneq ($(OS_DARWIN)$(CROSS),11) +ifeq ($(OS_DARWIN)$(CROSS),11) +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +else SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) @@ -173,35 +203,4 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = 
zgemm_otcopy$(TSUFFIX).$(SUFFIX) -else - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - endif From 0b095166788b28dc9270edca2eb62ef2f201f6fe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Dec 2018 18:33:05 +0100 Subject: [PATCH 03/23] Fix missing parameter in popen call --- cpuid_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_power.c b/cpuid_power.c index 23e98ebb0..82a3f4aac 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -136,7 +136,7 @@ int detect(void){ char buffer[512], *p; p = (char *)NULL; - infile = popen("prtconf|grep 'Processor Type'"); + infile = popen("prtconf|grep 'Processor Type'", "r"); while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("Pro", buffer, 3)){ p = strchr(buffer, ':') + 2; From 2b355592e34b07f4d0c5f81c275c902c0578236d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Dec 2018 16:25:55 +0100 Subject: [PATCH 04/23] Make sure to use the arm version of dynamic.c in ARM64 DYNAMIC_ARCH cf. 
#1908 --- driver/others/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index e20b14e79..f7cce4d46 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -47,7 +47,11 @@ GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1) GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1) if (DYNAMIC_ARCH) - list(APPEND COMMON_SOURCES dynamic.c) + if (ARM64) + list(APPEND COMMON_SOURcES dynamic_arm64.c) + else () + list(APPEND COMMON_SOURCES dynamic.c) + endif () else () list(APPEND COMMON_SOURCES parameter.c) endif () From 133c278ee565e91ff65d627b363aee36b71feeba Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Dec 2018 17:42:23 +0100 Subject: [PATCH 05/23] Add DYNAMIC_CORE list for ARM64 cf #1908 --- cmake/arch.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 52fb64eaa..63fb86fa2 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,6 +44,10 @@ endif () if (DYNAMIC_ARCH) + if (ARM64) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99) + endif () + if (X86) set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) endif () From 0bf6d74e5f9855ddf2028dcc099ee58e4f13446b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Dec 2018 19:37:33 +0100 Subject: [PATCH 06/23] Fix typo in previous commit for arm dynamic arch --- driver/others/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index f7cce4d46..a07e00b3b 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -48,7 +48,7 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" if (DYNAMIC_ARCH) if (ARM64) - list(APPEND COMMON_SOURcES dynamic_arm64.c) + list(APPEND COMMON_SOURCES dynamic_arm64.c) else () list(APPEND COMMON_SOURCES dynamic.c) endif () From 38cc63859131921885b80ed5139304dc80c5a163 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Dec 2018 21:09:26 +0100 Subject: [PATCH 07/23] Avoid adding blanket march=skylake-avx512 to dynamic_arch builds --- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f2647fb7d..dbee28079 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -9,6 +9,7 @@ endif endif ifeq ($(CORE), SKYLAKEX) +ifndef DYNAMIC_ARCH ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 FCOMMON_OPT += -march=skylake-avx512 @@ -22,6 +23,7 @@ endif endif endif endif +endif ifeq ($(OSNAME), Interix) ARFLAGS = -m x64 From 06f7d78d70b95f936765312b8c8b3cadf7265ae5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Dec 2018 21:10:38 +0100 Subject: [PATCH 08/23] Add -march=skylake-avx512 to SkylakeX part of DYNAMIC_ARCH builds --- kernel/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index 923ffc363..6e178f80b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -6,7 +6,11 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system

 ifdef TARGET_CORE
+ifeq ($(TARGET_CORE), SKYLAKEX)
+override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
+else
 override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
+endif
 BUILD_KERNEL = 1
 KDIR =
 TSUFFIX = _$(TARGET_CORE)

From 51aec8e96b78f93f9a6dcbbf1edd212c5f1ab2ca Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 11 Dec 2018 22:47:32 +0100
Subject: [PATCH 09/23] make sure the added march=skylake-avx512 does not cause problems on Windows

---
 kernel/Makefile | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/Makefile b/kernel/Makefile
index 6e178f80b..a441bde7c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,15 @@ include $(TOPDIR)/Makefile.system

 ifdef TARGET_CORE
 ifeq ($(TARGET_CORE), SKYLAKEX)
-override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
+  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
+  ifeq ($(OSNAME), CYGWIN_NT)
+  override CFLAGS += -fno-asynchronous-unwind-tables
+  endif
+  ifeq ($(OSNAME), WINNT)
+  ifeq ($(C_COMPILER), GCC)
+  override CFLAGS += -fno-asynchronous-unwind-tables
+  endif
+  endif
 else
 override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
 endif

From cdc668d82b7afd6a2ddee33987ecfebcaccebc2d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Wed, 12 Dec 2018 16:45:57 +0000
Subject: [PATCH 10/23] Add a "sgemm direct" mode for small matrices

OpenBLAS has a fancy algorithm for copying the input data while laying it
out in a more CPU-friendly memory layout.

This is great for large matrices; the cost of the copy is easily amortized
by the gains from the better memory layout.

But for small matrices (on CPUs that can do efficient unaligned loads) this
copy can be a net loss.

This patch adds (for SKYLAKEX initially) a "sgemm direct" mode that bypasses
the whole copy machinery for ALPHA=1/BETA=0/... standard arguments, for
small matrices only.

What is small?
For the non-threaded case this has been measured to be in the M*N*K = 28 * 512 * 512 range, while in the threaded case it's less, around M*N*K = 1 * 512 * 512 --- common_level3.h | 8 + interface/gemm.c | 8 + kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 467 ++++++++++++++++++++- param.h | 1 + 4 files changed, 483 insertions(+), 1 deletion(-) diff --git a/common_level3.h b/common_level3.h index 1f5490baa..6fa902be8 100644 --- a/common_level3.h +++ b/common_level3.h @@ -47,6 +47,14 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); extern "C" { #endif +extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB, + float * R, BLASLONG strideR); + +extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); + + int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, diff --git a/interface/gemm.c b/interface/gemm.c index a3bac5984..97e71bc85 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -271,6 +271,14 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; +#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) + if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { + sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc); + return; + } + +#endif + #ifndef COMPLEX args.alpha = (void *)α args.beta = (void *)β diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index 10d3d22ed..3246e681f 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -760,7 +760,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************************************/ int __attribute__ ((noinline)) -CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG ldc) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc) { unsigned long M = m, N = n, K = k; if (M == 0) @@ -1175,3 +1175,468 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f return 0; } + + +/* + * "Direct sgemm" code. This code operates directly on the inputs and outputs + * of the sgemm call, avoiding the copies, memory realignments and threading, + * and only supports alpha = 1 and beta = 0. + * This is a common case and provides value for relatively small matrixes. + * For larger matrixes the "regular" sgemm code is superior, there the cost of + * copying/shuffling the B matrix really pays off. 
+ */ + + + +#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps() +#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) +#define LOAD_B_512(N,M) __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)]) +#define MATMUL_512(N,M) result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M) +#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M) + + +#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps() +#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) +#define LOAD_B_256(N,M) __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)]) +#define MATMUL_256(N,M) result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M) +#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M) + +#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps() +#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) +#define LOAD_B_128(N,M) __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)]) +#define MATMUL_128(N,M) result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M) +#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M) + +#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0; +#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)]; +#define LOAD_B_SCALAR(N,M) float Bval##N = B[k * strideB + j + N]; +#define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N; +#define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M; + +int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) +{ + int mnk = M * N * K; + /* large matrixes -> not performant */ + if (mnk >= 28 * 512 * 512) + return 0; + + /* + * if the B matrix is not a nice multiple if 4 we get many unaligned accesses, + * and the regular sgemm copy/realignment of data pays off much quicker + */ + if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) + return 0; + +#ifdef SMP + /* if we can run multithreaded, the threading changes the based threshold */ + if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) + return 0; +#endif + + return 1; +} + + + +void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +{ + int i, j, k; + + int m4 = M & ~3; + int m2 = M & ~1; + + int n64 = N & ~63; + int n32 = N & ~31; + int n16 = N & ~15; + int n8 = N & ~7; + int n4 = N & ~3; + int n2 = N & ~1; + + i = 0; + + for (i = 0; i < m4; i+=4) { + + for (j = 0; j < n64; j+= 64) { + k = 0; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 
2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); LOAD_B_512(1, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + DECLARE_RESULT_256(0, 2); + DECLARE_RESULT_256(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + BROADCAST_LOAD_A_256(x, 1); + BROADCAST_LOAD_A_256(x, 2); + BROADCAST_LOAD_A_256(x, 3); + + LOAD_B_256(0, x); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + MATMUL_256(0, 2); + MATMUL_256(0, 3); + } + STORE_256(0, 0); + STORE_256(0, 1); + STORE_256(0, 2); + STORE_256(0, 3); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + DECLARE_RESULT_128(0, 2); + DECLARE_RESULT_128(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + BROADCAST_LOAD_A_128(x, 1); + BROADCAST_LOAD_A_128(x, 2); + BROADCAST_LOAD_A_128(x, 3); + + LOAD_B_128(0, x); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + MATMUL_128(0, 2); + MATMUL_128(0, 3); + } + STORE_128(0, 0); + STORE_128(0, 1); + STORE_128(0, 2); + STORE_128(0, 3); + } + + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); + DECLARE_RESULT_SCALAR(0, 2); DECLARE_RESULT_SCALAR(1, 2); + DECLARE_RESULT_SCALAR(0, 3); DECLARE_RESULT_SCALAR(1, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + BROADCAST_LOAD_A_SCALAR(x, 1); + BROADCAST_LOAD_A_SCALAR(x, 2); + BROADCAST_LOAD_A_SCALAR(x, 3); + + LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); + + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); + MATMUL_SCALAR(0, 2); MATMUL_SCALAR(1, 2); + MATMUL_SCALAR(0, 3); MATMUL_SCALAR(1, 3); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); + STORE_SCALAR(0, 2); STORE_SCALAR(1, 2); + STORE_SCALAR(0, 3); STORE_SCALAR(1, 3); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0) + DECLARE_RESULT_SCALAR(0, 1) + DECLARE_RESULT_SCALAR(0, 2) + 
DECLARE_RESULT_SCALAR(0, 3) + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + BROADCAST_LOAD_A_SCALAR(0, 1); + BROADCAST_LOAD_A_SCALAR(0, 2); + BROADCAST_LOAD_A_SCALAR(0, 3); + + LOAD_B_SCALAR(0, 0); + + MATMUL_SCALAR(0, 0); + MATMUL_SCALAR(0, 1); + MATMUL_SCALAR(0, 2); + MATMUL_SCALAR(0, 3); + } + STORE_SCALAR(0, 0); + STORE_SCALAR(0, 1); + STORE_SCALAR(0, 2); + STORE_SCALAR(0, 3); + } + } + + for (; i < m2; i+=2) { + j = 0; + + for (; j < n64; j+= 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); LOAD_B_512(1, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + BROADCAST_LOAD_A_256(x, 1); + + LOAD_B_256(0, x); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + } + STORE_256(0, 0); + STORE_256(0, 1); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + BROADCAST_LOAD_A_128(x, 1); + + LOAD_B_128(0, x); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + } + STORE_128(0, 0); + STORE_128(0, 1); + } + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + BROADCAST_LOAD_A_SCALAR(x, 1); + + LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); + + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0); + DECLARE_RESULT_SCALAR(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + BROADCAST_LOAD_A_SCALAR(0, 1); + + LOAD_B_SCALAR(0, 0); + + MATMUL_SCALAR(0, 0); + MATMUL_SCALAR(0, 1); + } + STORE_SCALAR(0, 0); + STORE_SCALAR(0, 1); + } + } + + for (; i < M; i+=1) { + j = 0; + for (; j < n64; j+= 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + MATMUL_512(0, 0); MATMUL_512(1, 
0); MATMUL_512(2, 0); MATMUL_512(3, 0);
+		}
+		STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0);
+	}
+	for (; j < n32; j+= 32) {
+		DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0);
+
+		for (k = 0; k < K; k++) {
+			BROADCAST_LOAD_A_512(x, 0);
+			LOAD_B_512(0, x); LOAD_B_512(1, x);
+			MATMUL_512(0, 0); MATMUL_512(1, 0);
+		}
+		STORE_512(0, 0); STORE_512(1, 0);
+	}
+
+
+	for (; j < n16; j+= 16) {
+		DECLARE_RESULT_512(0, 0);
+
+		for (k = 0; k < K; k++) {
+			BROADCAST_LOAD_A_512(x, 0);
+
+			LOAD_B_512(0, x);
+
+			MATMUL_512(0, 0);
+		}
+		STORE_512(0, 0);
+	}
+
+	for (; j < n8; j+= 8) {
+		DECLARE_RESULT_256(0, 0);
+
+		for (k = 0; k < K; k++) {
+			BROADCAST_LOAD_A_256(x, 0);
+			LOAD_B_256(0, x);
+			MATMUL_256(0, 0);
+		}
+		STORE_256(0, 0);
+	}
+
+	for (; j < n4; j+= 4) {
+		DECLARE_RESULT_128(0, 0);
+
+		for (k = 0; k < K; k++) {
+			BROADCAST_LOAD_A_128(x, 0);
+			LOAD_B_128(0, x);
+			MATMUL_128(0, 0);
+		}
+		STORE_128(0, 0);
+	}
+
+	for (; j < n2; j+= 2) {
+		DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0);
+
+		for (k = 0; k < K; k++) {
+			BROADCAST_LOAD_A_SCALAR(x, 0);
+			LOAD_B_SCALAR(0, 0); LOAD_B_SCALAR(1, 0);
+			MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0);
+		}
+		STORE_SCALAR(0, 0); STORE_SCALAR(1, 0);
+	}
+
+	for (; j < N; j++) {
+		DECLARE_RESULT_SCALAR(0, 0);
+
+		for (k = 0; k < K; k++) {
+			BROADCAST_LOAD_A_SCALAR(0, 0);
+			LOAD_B_SCALAR(0, 0);
+			MATMUL_SCALAR(0, 0);
+		}
+		STORE_SCALAR(0, 0);
+	}
+    }
+}
\ No newline at end of file
diff --git a/param.h b/param.h
index 8f56cdaaa..7a18d82d7 100644
--- a/param.h
+++ b/param.h
@@ -1628,6 +1628,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SWITCH_RATIO	32
 #define GEMM_PREFERED_SIZE	32
+#define USE_SGEMM_KERNEL_DIRECT 1

 #ifdef ARCH_X86

From 00dc09ad198aedec53fd05ea1b13d72d7a9a517a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sat, 15 Dec 2018 13:18:59 +0000
Subject: [PATCH 11/23] Use the skylake sgemm beta code also for haswell

With a few small changes it's possible to use the skylake sgemm code also
for haswell; this gives a modest gain (10% range) for smallish matrices
but does wonders for very skinny matrices.

---
 kernel/x86_64/KERNEL.HASWELL        |  1 +
 kernel/x86_64/sgemm_beta_skylakex.c | 15 +++++++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL
index 848de38df..2aec60064 100644
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@@ -33,6 +33,7 @@ ZAXPYKERNEL = zaxpy.c

 STRMMKERNEL = sgemm_kernel_16x4_haswell.S
 SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
+SGEMM_BETA = sgemm_beta_skylakex.c
 SGEMMINCOPY = ../generic/gemm_ncopy_16.c
 SGEMMITCOPY = ../generic/gemm_tcopy_16.c
 SGEMMONCOPY = ../generic/gemm_ncopy_4.c
diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c
index 498c46f0d..e8653112c 100644
--- a/kernel/x86_64/sgemm_beta_skylakex.c
+++ b/kernel/x86_64/sgemm_beta_skylakex.c
@@ -61,11 +61,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
   c_offset = c;

   if (beta == ZERO){
-    __m512 z_zero;
-    __m256 y_zero;
+#ifdef __AVX512CD__
+    __m512 z_zero = _mm512_setzero_ps();
+#endif
+    __m256 y_zero = _mm256_setzero_ps();

-    z_zero = _mm512_setzero_ps();
-    y_zero = _mm256_setzero_ps();
     j = n;
     do {
       c_offset1 = c_offset;
@@ -74,8 +74,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,

       i = m;

       while (i >= 32) {
+#ifdef __AVX512CD__
	 _mm512_storeu_ps(c_offset1, z_zero);
	 _mm512_storeu_ps(c_offset1 + 16, z_zero);
+#else
+	 _mm256_storeu_ps(c_offset1, 
y_zero); + _mm256_storeu_ps(c_offset1 + 8, y_zero); + _mm256_storeu_ps(c_offset1 + 16, y_zero); + _mm256_storeu_ps(c_offset1 + 24, y_zero); +#endif c_offset1 += 32; i -= 32; } From 0586899a10b97bf1baf50e4988d18b4268317420 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 15 Dec 2018 13:43:07 +0000 Subject: [PATCH 12/23] Use sgemm_ncopy_4_skylakex.c also for Haswell sgemm_ncopy_4_skylakex.c uses SSE transpose operations where the real perf win happens; this also works great for Haswell. This gives double digit percentage gains on small and skinny matrices --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/sgemm_ncopy_4_skylakex.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 2aec60064..422e6c315 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -36,7 +36,7 @@ SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMM_BETA = sgemm_beta_skylakex.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c -SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/sgemm_ncopy_4_skylakex.c b/kernel/x86_64/sgemm_ncopy_4_skylakex.c index 8577e3b38..6b2b0f5b1 100644 --- a/kernel/x86_64/sgemm_ncopy_4_skylakex.c +++ b/kernel/x86_64/sgemm_ncopy_4_skylakex.c @@ -49,8 +49,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __ FLOAT *b_offset; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; - FLOAT ctemp9, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp9, ctemp13; a_offset = a; b_offset = b; From 1ebe5c0f499575d42e85b4f89e4205882be8ebe3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 19:35:35 +0100 Subject: [PATCH 13/23] Add -march=haswell to HASWELL part of DYNAMIC_ARCH build --- kernel/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index a441bde7c..d86411d91 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,6 +16,8 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif +elseifeq($(TARGET_CORE), HASWELL) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=haswell else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From 2a3190dc76a3eb60fabe298b1df04c46cdca5350 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 20:17:44 +0100 Subject: [PATCH 14/23] fix elseifeq and use older option core2-avx for compatibility --- kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index d86411d91..169c7f79c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,8 +16,8 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif -elseifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=haswell +else ifeq($(TARGET_CORE), HASWELL) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core2-avx else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From fbcb14a74bb252ea344f5b10d3d741268326906f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 20:18:59 +0100 Subject: [PATCH 
15/23] should be core-avx2

---
 kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/Makefile b/kernel/Makefile
index 169c7f79c..a9208619f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,7 +17,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX)
   endif
 endif
 else ifeq($(TARGET_CORE), HASWELL)
-  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core2-avx
+  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core-avx2
 else
 override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
 endif

From 3843e3e01781970690325542fe15a722f87407c6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 15 Dec 2018 23:30:31 +0100
Subject: [PATCH 16/23] use -mavx2 on haswell

---
 kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/Makefile b/kernel/Makefile
index a9208619f..b01893175 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,7 +17,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX)
   endif
 endif
 else ifeq($(TARGET_CORE), HASWELL)
-  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core-avx2
+  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -mavx2
 else
 override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
 endif

From 69d206440ab669794201d65d4e8087060e519474 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sun, 16 Dec 2018 00:19:41 +0000
Subject: [PATCH 17/23] Make the skylakex/haswell sgemm code compile and run
 even with compilers without avx2 support

---
 kernel/x86_64/sgemm_beta_skylakex.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c
index e8653112c..cdc9c44be 100644
--- a/kernel/x86_64/sgemm_beta_skylakex.c
+++ b/kernel/x86_64/sgemm_beta_skylakex.c
@@ -61,10 +61,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
   c_offset = c;

   if (beta == ZERO){
-#ifdef __AVX512CD__
-    __m512 z_zero = _mm512_setzero_ps();
-#endif
-    __m256 y_zero = _mm256_setzero_ps();

     j = n;
     do {
@@ -72,12 +68,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
       c_offset += ldc;

       i = m;
-
+#ifdef __AVX2__
       while (i >= 32) {
 #ifdef __AVX512CD__
+	 __m512 z_zero = _mm512_setzero_ps();
	 _mm512_storeu_ps(c_offset1, z_zero);
	 _mm512_storeu_ps(c_offset1 + 16, z_zero);
 #else
+	 __m256 y_zero = _mm256_setzero_ps();
	 _mm256_storeu_ps(c_offset1, y_zero);
	 _mm256_storeu_ps(c_offset1 + 8, y_zero);
	 _mm256_storeu_ps(c_offset1 + 16, y_zero);
@@ -87,11 +85,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
	 i -= 32;
       }
       while (i >= 8) {
+	 __m256 y_zero = _mm256_setzero_ps();
	 _mm256_storeu_ps(c_offset1, y_zero);
	 c_offset1 += 8;
	 i -= 8;
       }
-
+#endif
       while (i > 0) {
	 *c_offset1 = ZERO;
	 c_offset1 ++;

From 545c2b1bbbbe9a1c548150189e54fc76e62e4b13 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 16 Dec 2018 13:09:19 +0100
Subject: [PATCH 18/23] Add -mavx2 on Haswell only if the compiler supports it

---
 kernel/Makefile | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/kernel/Makefile b/kernel/Makefile
index b01893175..17bfd4063 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,6 +5,27 @@ endif

 TOPDIR = .. 
include $(TOPDIR)/Makefile.system +AVX2OPT = +ifeq ($(C_COMPILER), GCC) +# AVX2 support was added in 4.7.0 + GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) + GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) + ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) + AVX2OPT = -mavx2 + endif +endif +ifeq ($(C_COMPILER), CLANG) +# Any clang posing as gcc 4.2 should be new enough (3.4 or later) + GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) + GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) + ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11) + AVX2OPT -mavx2 + endif +endif +ifdef NO_AVX2 + AVX2OPT= +endif + ifdef TARGET_CORE ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 @@ -17,9 +38,9 @@ ifeq ($(TARGET_CORE), SKYLAKEX) endif endif else ifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -mavx2 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else -override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif BUILD_KERNEL = 1 KDIR = From cfc4acc221344d53d72550d157c5050ddaa26ed7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Dec 2018 16:19:51 +0100 Subject: [PATCH 19/23] typo --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 17bfd4063..30292cd80 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -19,7 +19,7 @@ ifeq ($(C_COMPILER), CLANG) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11) - AVX2OPT -mavx2 + AVX2OPT = -mavx2 endif endif ifdef NO_AVX2 From c4e23dd016ed2852ebf59a0d744deb55a48e66c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Dec 2018 18:14:40 +0100 Subject: [PATCH 20/23] Update Makefile --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 30292cd80..e81225075 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -37,7 +37,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif -else ifeq($(TARGET_CORE), HASWELL) +else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From c43331ad0aeaefe4b4d90aab06c93655c851feab Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 22:59:02 +0000 Subject: [PATCH 21/23] dgemm: Use the skylakex beta function also for haswell it's more efficient for certain tall/skinny matrices --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/dgemm_beta_skylakex.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 422e6c315..4cd67a705 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -45,6 +45,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c DGEMMKERNEL = dgemm_kernel_4x8_haswell.S +DGEMM_BETA = dgemm_beta_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_4.c DGEMMITCOPY = ../generic/gemm_tcopy_4.c 
DGEMMONCOPY = ../generic/gemm_ncopy_8.c diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 6a824c9b5..8c24725a1 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -61,17 +61,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset = c; if (beta == ZERO){ - __m512d z_zero; - z_zero = _mm512_setzero_pd(); j = n; do { c_offset1 = c_offset; c_offset += ldc; i = m; - +#ifdef __AVX2__ +#ifdef __AVX512CD__ while (i >= 32) { + __m512d z_zero = _mm512_setzero_pd(); _mm512_storeu_pd(c_offset1, z_zero); _mm512_storeu_pd(c_offset1 + 8, z_zero); _mm512_storeu_pd(c_offset1 + 16, z_zero); @@ -79,12 +79,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 += 32; i -= 32; } +#endif while (i >= 8) { +#ifdef __AVX512CD__ + __m512d z_zero = _mm512_setzero_pd(); _mm512_storeu_pd(c_offset1, z_zero); +#else + __m256d y_zero = _mm256_setzero_pd(); + _mm256_storeu_pd(c_offset1, y_zero); + _mm256_storeu_pd(c_offset1 + 4, y_zero); +#endif c_offset1 += 8; i -= 8; } - +#endif while (i > 0) { *c_offset1 = ZERO; c_offset1 ++; From d321448a63954d536f90592cd0cc53c304b08d2e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 23:06:58 +0000 Subject: [PATCH 22/23] dgemm: use dgemm_ncopy_8_skylakex.c also for Haswell The dgemm_ncopy_8_skylakex.c code is not avx512 specific and gives a nice performance boost for medium sized matrices --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 4cd67a705..f98728a41 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -48,7 +48,7 @@ DGEMMKERNEL = dgemm_kernel_4x8_haswell.S DGEMM_BETA = dgemm_beta_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_4.c DGEMMITCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = ../generic/gemm_tcopy_8.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) From b28f75cd7e61cf5bdcf404ebece07f75553ecde0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 23:08:31 +0000 Subject: [PATCH 23/23] set GEMM_PREFERED_SIZE for HASWELL Haswell likes a GEMM_PREFERED_SIZE of 16 to improve the split that the threading code does to make it a nice multiple of the SIMD kernel size --- param.h | 1 + 1 file changed, 1 insertion(+) diff --git a/param.h b/param.h index 7a18d82d7..fa6730208 100644 --- a/param.h +++ b/param.h @@ -1508,6 +1508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 #define SWITCH_RATIO 32 +#define GEMM_PREFERED_SIZE 16 #ifdef ARCH_X86
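
To make the intent of GEMM_PREFERED_SIZE concrete, here is a minimal
standalone sketch of the rounding it enables; split_width and its logic are
hypothetical stand-ins for illustration, not the actual OpenBLAS threading
code:

#include <stdio.h>

#define GEMM_PREFERED_SIZE 16   /* the Haswell value set by this patch */

/* Round a thread's share of M up to a multiple of GEMM_PREFERED_SIZE so
 * each thread's block stays a nice multiple of the SIMD kernel size
 * (hypothetical helper, for illustration only). */
static long split_width(long m, int nthreads)
{
    long width = (m + nthreads - 1) / nthreads;  /* naive even split */
    width += GEMM_PREFERED_SIZE - 1;
    width -= width % GEMM_PREFERED_SIZE;         /* round up to a multiple */
    return width;
}

int main(void)
{
    /* M = 100 over 4 threads: a naive split gives 25 rows per thread,
     * which 16-wide kernels handle poorly; rounding yields 32, so the
     * work divides as 32/32/32/4 instead. */
    printf("per-thread width: %ld\n", split_width(100, 4));
    return 0;
}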
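Similarly, a usage sketch for the "sgemm direct" mode introduced in patch 10:
per the dispatch added to interface/gemm.c, the direct path is only considered
for row-major, non-transposed sgemm calls with alpha == 1 and beta == 0 whose
M*N*K falls under the measured thresholds. Sizes and data below are
illustrative, and the example assumes an OpenBLAS build where
USE_SGEMM_KERNEL_DIRECT is in effect:

#include <stdio.h>
#include <cblas.h>

int main(void)
{
    enum { N = 64 };  /* 64*64*64 is well below the 28*512*512 cutoff */
    static float a[N * N], b[N * N], c[N * N];
    int i;

    for (i = 0; i < N * N; i++) {  /* arbitrary small test data */
        a[i] = (float)(i % 7);
        b[i] = (float)(i % 5);
    }

    /* Row-major, no transpose, alpha = 1, beta = 0: the call shape that
     * sgemm_kernel_direct_performant() can route to the direct kernel
     * instead of the copy-based path. */
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                N, N, N, 1.0f, a, N, b, N, 0.0f, c, N);

    printf("c[0] = %f\n", c[0]);
    return 0;
}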