s390x/GEMM: replace 0-init with peeled first iteration
Peeling the first iteration of the k-loop, instead of zero-initializing the accumulators, gains another ~2% of SGEMM and DGEMM performance on z15; the code was also due for this cleanup. Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
This commit is contained in:
parent
e1038ea836
commit
bdd795ed03
|
@ -203,9 +203,12 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
|
|||
"rows in block must be multiples of vector length"); \
|
||||
vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \
|
||||
\
|
||||
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) \
|
||||
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
|
||||
vector_float A0 = \
|
||||
vec_load_hinted(A + i * VLEN_FLOATS); \
|
||||
for (BLASLONG j = 0; j < COLS; j++) \
|
||||
Caux[i][j] = vec_splats(ZERO); \
|
||||
Caux[i][j] = A0 * B[j]; \
|
||||
} \
|
||||
\
|
||||
/* \
|
||||
* Stream over the row-block of A, which is packed \
|
||||
|
@ -216,7 +219,7 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
|
|||
* That equates to unrolling the loop over rows (in i) and \
|
||||
* executing each unrolled iteration as a vector element. \
|
||||
*/ \
|
||||
for (BLASLONG k = 0; k < bk; k++) { \
|
||||
for (BLASLONG k = 1; k < bk; k++) { \
|
||||
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
|
||||
vector_float Ak = vec_load_hinted( \
|
||||
A + i * VLEN_FLOATS + k * ROWS); \
|
||||
|
|
Loading…
Reference in New Issue