From 00dc09ad198aedec53fd05ea1b13d72d7a9a517a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 15 Dec 2018 13:18:59 +0000 Subject: [PATCH 01/10] Use the skylake sgemm beta code also for haswell with a few small changes it's possible to use the skylake sgemm code also for haswell, this gives a modest gain (10% range) for smallish matrixes but does wonders for very skinny matrixes --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/sgemm_beta_skylakex.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 848de38df..2aec60064 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -33,6 +33,7 @@ ZAXPYKERNEL = zaxpy.c STRMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMKERNEL = sgemm_kernel_16x4_haswell.S +SGEMM_BETA = sgemm_beta_skylakex.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 498c46f0d..e8653112c 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -61,11 +61,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset = c; if (beta == ZERO){ - __m512 z_zero; - __m256 y_zero; +#ifdef __AVX512CD__ + __m512 z_zero = _mm512_setzero_ps(); +#endif + __m256 y_zero = _mm256_setzero_ps(); - z_zero = _mm512_setzero_ps(); - y_zero = _mm256_setzero_ps(); j = n; do { c_offset1 = c_offset; @@ -74,8 +74,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; while (i >= 32) { +#ifdef __AVX512CD__ _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); +#else + _mm256_storeu_ps(c_offset1, y_zero); + _mm256_storeu_ps(c_offset1 + 8, y_zero); + _mm256_storeu_ps(c_offset1 + 16, y_zero); + _mm256_storeu_ps(c_offset1 + 24, y_zero); +#endif c_offset1 += 32; i -= 32; } From 0586899a10b97bf1baf50e4988d18b4268317420 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 15 Dec 2018 13:43:07 +0000 Subject: [PATCH 02/10] Use sgemm_ncopy_4_skylakex.c also for Haswell sgemm_ncopy_4_skylakex.c uses SSE transpose operations where the real perf win happens; this also works great for Haswell. This gives double digit percentage gains on small and skinny matrices --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/sgemm_ncopy_4_skylakex.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 2aec60064..422e6c315 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -36,7 +36,7 @@ SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMM_BETA = sgemm_beta_skylakex.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c -SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/sgemm_ncopy_4_skylakex.c b/kernel/x86_64/sgemm_ncopy_4_skylakex.c index 8577e3b38..6b2b0f5b1 100644 --- a/kernel/x86_64/sgemm_ncopy_4_skylakex.c +++ b/kernel/x86_64/sgemm_ncopy_4_skylakex.c @@ -49,8 +49,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __ FLOAT *b_offset; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; - FLOAT ctemp9, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp9, ctemp13; a_offset = a; b_offset = b; From 1ebe5c0f499575d42e85b4f89e4205882be8ebe3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 19:35:35 +0100 Subject: [PATCH 03/10] Add -march=haswell to HASWELL part of DYNAMIC_ARCH build --- kernel/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index a441bde7c..d86411d91 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,6 +16,8 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif +elseifeq($(TARGET_CORE), HASWELL) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=haswell else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From 2a3190dc76a3eb60fabe298b1df04c46cdca5350 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 20:17:44 +0100 Subject: [PATCH 04/10] fix elseifeq and use older option core2-avx for compatibility --- kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index d86411d91..169c7f79c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,8 +16,8 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif -elseifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=haswell +else ifeq($(TARGET_CORE), HASWELL) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core2-avx else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From fbcb14a74bb252ea344f5b10d3d741268326906f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 20:18:59 +0100 Subject: [PATCH 05/10] should be core-avx2 --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 169c7f79c..a9208619f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -17,7 +17,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX) endif endif else ifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core2-avx + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core-avx2 else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From 3843e3e01781970690325542fe15a722f87407c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 23:30:31 +0100 Subject: [PATCH 06/10] use -maxv2 on haswell --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index a9208619f..b01893175 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -17,7 +17,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX) endif endif else ifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core-avx2 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -mavx2 else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From 69d206440ab669794201d65d4e8087060e519474 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 00:19:41 +0000 Subject: [PATCH 07/10] Make the skylakex/haswell sgemm code compile and run even with compilers without avx2 support --- kernel/x86_64/sgemm_beta_skylakex.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index e8653112c..cdc9c44be 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -61,10 +61,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset = c; if (beta == ZERO){ -#ifdef __AVX512CD__ - __m512 z_zero = _mm512_setzero_ps(); -#endif - __m256 y_zero = _mm256_setzero_ps(); j = n; do { @@ -72,12 +68,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset += ldc; i = m; - +#ifdef __AVX2__ while (i >= 32) { #ifdef __AVX512CD__ + __m512 z_zero = _mm512_setzero_ps(); _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); #else + __m256 y_zero = _mm256_setzero_ps(); _mm256_storeu_ps(c_offset1, y_zero); _mm256_storeu_ps(c_offset1 + 8, y_zero); _mm256_storeu_ps(c_offset1 + 16, y_zero); @@ -87,11 +85,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i -= 32; } while (i >= 8) { + __m256 y_zero = _mm256_setzero_ps(); _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8; } - +#endif while (i > 0) { *c_offset1 = ZERO; c_offset1 ++; From 545c2b1bbbbe9a1c548150189e54fc76e62e4b13 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Dec 2018 13:09:19 +0100 Subject: [PATCH 08/10] Add -mavx2 on Haswell only if the compiler supports it --- kernel/Makefile | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index b01893175..17bfd4063 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,6 +5,27 @@ endif TOPDIR = .. include $(TOPDIR)/Makefile.system +AVX2OPT = +ifeq ($(C_COMPILER), GCC) +# AVX2 support was added in 4.7.0 + GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) + GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) + ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) + AVX2OPT = -mavx2 + endif +endif +ifeq ($(C_COMPILER), CLANG) +# Any clang posing as gcc 4.2 should be new enough (3.4 or later) + GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) + GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) + ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11) + AVX2OPT -mavx2 + endif +endif +ifdef NO_AVX2 + AVX2OPT= +endif + ifdef TARGET_CORE ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 @@ -17,9 +38,9 @@ ifeq ($(TARGET_CORE), SKYLAKEX) endif endif else ifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -mavx2 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else -override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif BUILD_KERNEL = 1 KDIR = From cfc4acc221344d53d72550d157c5050ddaa26ed7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Dec 2018 16:19:51 +0100 Subject: [PATCH 09/10] typo --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 17bfd4063..30292cd80 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -19,7 +19,7 @@ ifeq ($(C_COMPILER), CLANG) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11) - AVX2OPT -mavx2 + AVX2OPT = -mavx2 endif endif ifdef NO_AVX2 From c4e23dd016ed2852ebf59a0d744deb55a48e66c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Dec 2018 18:14:40 +0100 Subject: [PATCH 10/10] Update Makefile --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 30292cd80..e81225075 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -37,7 +37,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif -else ifeq($(TARGET_CORE), HASWELL) +else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)