s390x/Z14: Change register blocking for SGEMM to 16x4
Change register blocking for SGEMM (and STRMM) on z14 from 8x4 to 16x4 by adjusting SGEMM_DEFAULT_UNROLL_M and choosing the appropriate copy implementations. In addition, make KERNEL.Z14 more flexible, so that the change in param.h alone suffices. As a result, performance for SGEMM improves by around 30% on z15. On z14, FP SIMD instructions can operate on float-sized scalars in vector registers, while z13 could do that for double-sized scalars only. Thus, we can double the number of elements of C that are held in registers in an SGEMM kernel. Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
This commit is contained in:
parent
71b6eaf459
commit
1b0b4349a1
|
@ -92,12 +92,14 @@ CTRMMKERNEL = ctrmm4x4V.S
|
|||
ZTRMMKERNEL = ztrmm4x4V.S
|
||||
|
||||
SGEMMKERNEL = gemm_vec.c
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
|
|
@ -220,6 +220,15 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
|
|||
}
|
||||
|
||||
|
||||
#if UNROLL_M == 16
|
||||
VECTOR_BLOCK(16, 4)
|
||||
VECTOR_BLOCK(16, 2)
|
||||
VECTOR_BLOCK(16, 1)
|
||||
#endif
|
||||
#if UNROLL_N == 8
|
||||
VECTOR_BLOCK(8, 8)
|
||||
VECTOR_BLOCK(4, 8)
|
||||
#endif
|
||||
VECTOR_BLOCK(8, 4)
|
||||
VECTOR_BLOCK(8, 2)
|
||||
VECTOR_BLOCK(8, 1)
|
||||
|
@ -284,6 +293,12 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
|
|||
return; \
|
||||
}
|
||||
|
||||
#if UNROLL_M == 16
|
||||
BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1);
|
||||
#endif
|
||||
#if UNROLL_N == 8
|
||||
BLOCK(8, 8); BLOCK(4, 8);
|
||||
#endif
|
||||
BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
|
||||
BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
|
||||
|
||||
|
|
2
param.h
2
param.h
|
@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
|
|
Loading…
Reference in New Issue