Use sgemm_ncopy_4_skylakex.c also for Haswell
sgemm_ncopy_4_skylakex.c uses SSE transpose operations where the real perf win happens; this also works great for Haswell. This gives double digit percentage gains on small and skinny matrices
This commit is contained in:
parent
00dc09ad19
commit
0586899a10
|
@ -36,7 +36,7 @@ SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
|
|||
SGEMM_BETA = sgemm_beta_skylakex.c
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
|
|
@ -49,8 +49,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __
|
|||
FLOAT *b_offset;
|
||||
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
|
||||
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
|
||||
FLOAT ctemp9, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp9, ctemp13;
|
||||
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
|
Loading…
Reference in New Issue