From 1b0b4349a11f8de40037d9bddf9ddb9b094cdd2c Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 12 May 2020 15:06:38 +0200 Subject: [PATCH] s390x/Z14: Change register blocking for SGEMM to 16x4 Change register blocking for SGEMM (and STRMM) on z14 from 8x4 to 16x4 by adjusting SGEMM_DEFAULT_UNROLL_M and choosing the appropriate copy implementations. Actually make KERNEL.Z14 more flexible, so that the change in param.h suffices. As a result, performance for SGEMM improves by around 30% on z15. On z14, FP SIMD instructions can operate on float-sized scalars in vector registers, while z13 could do that for double-sized scalars only. Thus, we can double the amount of elements of C that are held in registers in an SGEMM kernel. Signed-off-by: Marius Hillenbrand --- kernel/zarch/KERNEL.Z14 | 10 ++++++---- kernel/zarch/gemm_vec.c | 15 +++++++++++++++ param.h | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index 49fa28175..96e6745fd 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -92,12 +92,14 @@ CTRMMKERNEL = ctrmm4x4V.S ZTRMMKERNEL = ztrmm4x4V.S SGEMMKERNEL = gemm_vec.c -SGEMMINCOPY = ../generic/gemm_ncopy_8.c -SGEMMITCOPY = ../generic/gemm_tcopy_8.c -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index a9531c7a5..4e1b3e3fb 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -220,6 +220,15 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16))); } +#if UNROLL_M == 16 +VECTOR_BLOCK(16, 4) +VECTOR_BLOCK(16, 2) +VECTOR_BLOCK(16, 1) +#endif +#if UNROLL_N == 8 +VECTOR_BLOCK(8, 8) +VECTOR_BLOCK(4, 8) +#endif VECTOR_BLOCK(8, 4) VECTOR_BLOCK(8, 2) VECTOR_BLOCK(8, 1) @@ -284,6 +293,12 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, return; \ } +#if UNROLL_M == 16 + BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1); +#endif +#if UNROLL_N == 8 + BLOCK(8, 8); BLOCK(4, 8); +#endif BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); diff --git a/param.h b/param.h index 7094249e8..6f0a3b727 100644 --- a/param.h +++ b/param.h @@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 8