Use sgemm_ncopy_4_skylakex.c also for Haswell

sgemm_ncopy_4_skylakex.c uses SSE transpose operations, which is where the real performance win comes from; this also works well on Haswell. It gives double-digit percentage gains on small and skinny matrices.
2018-12-15 13:43:07 +00:00 · 2018-12-15 13:43:07 +00:00 · 0586899a10
parent 00dc09ad19
commit 0586899a10
2 changed files with 2 additions and 3 deletions
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@ -36,7 +36,7 @@ SGEMMKERNEL    =  sgemm_kernel_16x4_haswell.S
 SGEMM_BETA     =  sgemm_beta_skylakex.c
 SGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
 SGEMMITCOPY    =  ../generic/gemm_tcopy_16.c
-SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
+SGEMMONCOPY    =  sgemm_ncopy_4_skylakex.c
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
--- a/kernel/x86_64/sgemm_ncopy_4_skylakex.c
+++ b/kernel/x86_64/sgemm_ncopy_4_skylakex.c
@ -49,8 +49,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __
  FLOAT *b_offset;
  FLOAT  ctemp1,  ctemp2,  ctemp3,  ctemp4;
  FLOAT  ctemp5,  ctemp6,  ctemp7,  ctemp8;
-  FLOAT  ctemp9, ctemp10, ctemp11, ctemp12;
-  FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+  FLOAT  ctemp9,  ctemp13;

  a_offset = a;
  b_offset = b;