Use sgemm_ncopy_4_skylakex.c also for Haswell

sgemm_ncopy_4_skylakex.c uses SSE transpose operations where the
real perf win happens; this also works great for Haswell.

This gives double digit percentage gains on small and skinny matrices
This commit is contained in:
Arjan van de Ven 2018-12-15 13:43:07 +00:00
parent 00dc09ad19
commit 0586899a10
2 changed files with 2 additions and 3 deletions

View File

@ -36,7 +36,7 @@ SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMM_BETA = sgemm_beta_skylakex.c
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)

View File

@ -49,8 +49,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __
FLOAT *b_offset;
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
FLOAT ctemp9, ctemp10, ctemp11, ctemp12;
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
FLOAT ctemp9, ctemp13;
a_offset = a;
b_offset = b;