From bdd795ed03667861b762836aa64e4b2bd33bf485 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 19 May 2020 14:30:44 +0200 Subject: [PATCH] s390x/GEMM: replace 0-init with peeled first iteration ... since it gains another ~2% of SGEMM and DGEMM performance on z15; also, the code just called for that cleanup. Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index 2d4457f06..eb6d7700b 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -203,9 +203,12 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) { "rows in block must be multiples of vector length"); \ vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \ \ - for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float A0 = \ + vec_load_hinted(A + i * VLEN_FLOATS); \ for (BLASLONG j = 0; j < COLS; j++) \ - Caux[i][j] = vec_splats(ZERO); \ + Caux[i][j] = A0 * B[j]; \ + } \ \ /* \ * Stream over the row-block of A, which is packed \ @@ -216,7 +219,7 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) { * That equates to unrolling the loop over rows (in i) and \ * executing each unrolled iteration as a vector element. \ */ \ - for (BLASLONG k = 0; k < bk; k++) { \ + for (BLASLONG k = 1; k < bk; k++) { \ for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ vector_float Ak = vec_load_hinted( \ A + i * VLEN_FLOATS + k * ROWS); \