s390x/Z14: Change register blocking for SGEMM to 16x4
Change register blocking for SGEMM (and STRMM) on z14 from 8x4 to 16x4 by adjusting SGEMM_DEFAULT_UNROLL_M and choosing the appropriate copy implementations. To that end, make KERNEL.Z14 more flexible by deriving the copy implementations from SGEMM_UNROLL_M and SGEMM_UNROLL_N, so that the change in param.h alone suffices. As a result, performance for SGEMM improves by around 30% on z15. On z14, FP SIMD instructions can operate on float-sized scalars in vector registers, while z13 could do that for double-sized scalars only. Thus, we can double the number of elements of C that are held in registers in an SGEMM kernel. Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
This commit is contained in:
parent
71b6eaf459
commit
1b0b4349a1
|
@ -92,12 +92,14 @@ CTRMMKERNEL = ctrmm4x4V.S
|
||||||
ZTRMMKERNEL = ztrmm4x4V.S
|
ZTRMMKERNEL = ztrmm4x4V.S
|
||||||
|
|
||||||
SGEMMKERNEL = gemm_vec.c
|
SGEMMKERNEL = gemm_vec.c
|
||||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N))
|
||||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
|
||||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||||
|
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
|
|
@ -220,6 +220,15 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#if UNROLL_M == 16
|
||||||
|
VECTOR_BLOCK(16, 4)
|
||||||
|
VECTOR_BLOCK(16, 2)
|
||||||
|
VECTOR_BLOCK(16, 1)
|
||||||
|
#endif
|
||||||
|
#if UNROLL_N == 8
|
||||||
|
VECTOR_BLOCK(8, 8)
|
||||||
|
VECTOR_BLOCK(4, 8)
|
||||||
|
#endif
|
||||||
VECTOR_BLOCK(8, 4)
|
VECTOR_BLOCK(8, 4)
|
||||||
VECTOR_BLOCK(8, 2)
|
VECTOR_BLOCK(8, 2)
|
||||||
VECTOR_BLOCK(8, 1)
|
VECTOR_BLOCK(8, 1)
|
||||||
|
@ -284,6 +293,12 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
|
||||||
return; \
|
return; \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if UNROLL_M == 16
|
||||||
|
BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1);
|
||||||
|
#endif
|
||||||
|
#if UNROLL_N == 8
|
||||||
|
BLOCK(8, 8); BLOCK(4, 8);
|
||||||
|
#endif
|
||||||
BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
|
BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
|
||||||
BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
|
BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
|
||||||
|
|
||||||
|
|
2
param.h
2
param.h
|
@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
||||||
#define GEMM_DEFAULT_OFFSET_B 0
|
#define GEMM_DEFAULT_OFFSET_B 0
|
||||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||||
|
|
Loading…
Reference in New Issue