Merge pull request #2623 from mhillenibm/zarch_dgemm_z14
s390x: Use new sgemm kernel also for DGEMM and DTRMM on Z14 (+ small cleanup)
This commit is contained in:
commit
729ac6bd4a
|
@@ -87,7 +87,7 @@ CGEMVTKERNEL = cgemv_t_4.c
|
||||||
ZGEMVTKERNEL = zgemv_t_4.c
|
ZGEMVTKERNEL = zgemv_t_4.c
|
||||||
|
|
||||||
STRMMKERNEL = gemm_vec.c
|
STRMMKERNEL = gemm_vec.c
|
||||||
DTRMMKERNEL = trmm8x4V.S
|
DTRMMKERNEL = gemm_vec.c
|
||||||
CTRMMKERNEL = ctrmm4x4V.S
|
CTRMMKERNEL = ctrmm4x4V.S
|
||||||
ZTRMMKERNEL = ztrmm4x4V.S
|
ZTRMMKERNEL = ztrmm4x4V.S
|
||||||
|
|
||||||
|
@@ -103,7 +103,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
DGEMMKERNEL = gemm8x4V.S
|
DGEMMKERNEL = gemm_vec.c
|
||||||
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||||
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
|
|
|
@@ -203,9 +203,12 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
|
||||||
"rows in block must be multiples of vector length"); \
|
"rows in block must be multiples of vector length"); \
|
||||||
vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \
|
vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \
|
||||||
\
|
\
|
||||||
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) \
|
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
|
||||||
|
vector_float A0 = \
|
||||||
|
vec_load_hinted(A + i * VLEN_FLOATS); \
|
||||||
for (BLASLONG j = 0; j < COLS; j++) \
|
for (BLASLONG j = 0; j < COLS; j++) \
|
||||||
Caux[i][j] = vec_splats(ZERO); \
|
Caux[i][j] = A0 * B[j]; \
|
||||||
|
} \
|
||||||
\
|
\
|
||||||
/* \
|
/* \
|
||||||
* Stream over the row-block of A, which is packed \
|
* Stream over the row-block of A, which is packed \
|
||||||
|
@@ -216,7 +219,7 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
|
||||||
* That equates to unrolling the loop over rows (in i) and \
|
* That equates to unrolling the loop over rows (in i) and \
|
||||||
* executing each unrolled iteration as a vector element. \
|
* executing each unrolled iteration as a vector element. \
|
||||||
*/ \
|
*/ \
|
||||||
for (BLASLONG k = 0; k < bk; k++) { \
|
for (BLASLONG k = 1; k < bk; k++) { \
|
||||||
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
|
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
|
||||||
vector_float Ak = vec_load_hinted( \
|
vector_float Ak = vec_load_hinted( \
|
||||||
A + i * VLEN_FLOATS + k * ROWS); \
|
A + i * VLEN_FLOATS + k * ROWS); \
|
||||||
|
|
Loading…
Reference in New Issue