s390x/Z14: Change register blocking for SGEMM to 16x4

Change register blocking for SGEMM (and STRMM) on z14 from 8x4 to 16x4 by adjusting SGEMM_DEFAULT_UNROLL_M and choosing the appropriate copy implementations. In addition, make KERNEL.Z14 more flexible, so that the change in param.h alone suffices. As a result, performance for SGEMM improves by around 30% on z15. On z14, FP SIMD instructions can operate on float-sized scalars in vector registers, while z13 could do that for double-sized scalars only. Thus, we can double the number of elements of C that are held in registers in an SGEMM kernel. Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-12 15:06:38 +02:00 · 2020-05-12 15:06:38 +02:00 · 1b0b4349a1
parent 71b6eaf459
commit 1b0b4349a1
3 changed files with 22 additions and 5 deletions
--- a/kernel/zarch/KERNEL.Z14
+++ b/kernel/zarch/KERNEL.Z14
@@ -92,12 +92,14 @@ CTRMMKERNEL	= ctrmm4x4V.S
 ZTRMMKERNEL	= ztrmm4x4V.S
 SGEMMKERNEL    = gemm_vec.c
-SGEMMINCOPY    = ../generic/gemm_ncopy_8.c
-SGEMMITCOPY    = ../generic/gemm_tcopy_8.c
-SGEMMONCOPY    = ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
+ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N))
+SGEMMINCOPY    = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY    = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
 SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+SGEMMONCOPY    = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY    = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
--- a/kernel/zarch/gemm_vec.c
+++ b/kernel/zarch/gemm_vec.c
@@ -220,6 +220,15 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
 	}
+#if UNROLL_M == 16
+VECTOR_BLOCK(16, 4)
+VECTOR_BLOCK(16, 2)
+VECTOR_BLOCK(16, 1)
+#endif
+#if UNROLL_N == 8
+VECTOR_BLOCK(8, 8)
+VECTOR_BLOCK(4, 8)
+#endif
 VECTOR_BLOCK(8, 4)
 VECTOR_BLOCK(8, 2)
 VECTOR_BLOCK(8, 1)
@@ -284,6 +293,12 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
 		return;                                         \
 	}
+#if UNROLL_M == 16
+	BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1);
+#endif
+#if UNROLL_N == 8
+	BLOCK(8, 8); BLOCK(4, 8);
+#endif
 	BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
 	BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
--- a/param.h
+++ b/param.h
@@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
-#define SGEMM_DEFAULT_UNROLL_M  8
+#define SGEMM_DEFAULT_UNROLL_M  16
 #define SGEMM_DEFAULT_UNROLL_N  4
 #define DGEMM_DEFAULT_UNROLL_M  8