From c43331ad0aeaefe4b4d90aab06c93655c851feab Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 22:59:02 +0000 Subject: [PATCH 1/3] dgemm: Use the skylakex beta function also for haswell it's more efficient for certain tall/skinny matrices --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/dgemm_beta_skylakex.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 422e6c315..4cd67a705 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -45,6 +45,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c DGEMMKERNEL = dgemm_kernel_4x8_haswell.S +DGEMM_BETA = dgemm_beta_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_4.c DGEMMITCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPY = ../generic/gemm_ncopy_8.c diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 6a824c9b5..8c24725a1 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -61,17 +61,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset = c; if (beta == ZERO){ - __m512d z_zero; - z_zero = _mm512_setzero_pd(); j = n; do { c_offset1 = c_offset; c_offset += ldc; i = m; - +#ifdef __AVX2__ +#ifdef __AVX512CD__ while (i >= 32) { + __m512d z_zero = _mm512_setzero_pd(); _mm512_storeu_pd(c_offset1, z_zero); _mm512_storeu_pd(c_offset1 + 8, z_zero); _mm512_storeu_pd(c_offset1 + 16, z_zero); @@ -79,12 +79,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 += 32; i -= 32; } +#endif while (i >= 8) { +#ifdef __AVX512CD__ + __m512d z_zero = _mm512_setzero_pd(); _mm512_storeu_pd(c_offset1, z_zero); +#else + __m256d y_zero = _mm256_setzero_pd(); + _mm256_storeu_pd(c_offset1, y_zero); + _mm256_storeu_pd(c_offset1 + 4, y_zero); +#endif c_offset1 += 8; i -= 8; } - +#endif while (i > 0) { *c_offset1 = ZERO; c_offset1 ++; From d321448a63954d536f90592cd0cc53c304b08d2e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 23:06:58 +0000 Subject: [PATCH 2/3] dgemm: use dgemm_ncopy_8_skylakex.c also for Haswell The dgemm_ncopy_8_skylakex.c code is not avx512 specific and gives a nice performance boost for medium sized matrices --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 4cd67a705..f98728a41 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -48,7 +48,7 @@ DGEMMKERNEL = dgemm_kernel_4x8_haswell.S DGEMM_BETA = dgemm_beta_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_4.c DGEMMITCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = ../generic/gemm_tcopy_8.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) From b28f75cd7e61cf5bdcf404ebece07f75553ecde0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 23:08:31 +0000 Subject: [PATCH 3/3] set GEMM_PREFERED_SIZE for HASWELL Haswell likes a GEMM_PREFERED_SIZE of 16 to improve the split that the threading code does to make it a nice multiple of the SIMD kernel size --- param.h | 1 + 1 file changed, 1 insertion(+) diff --git a/param.h b/param.h index 7a18d82d7..fa6730208 100644 --- a/param.h +++ b/param.h @@ -1508,6 +1508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 #define SWITCH_RATIO 32 +#define GEMM_PREFERED_SIZE 16 #ifdef ARCH_X86