Use the skylake sgemm beta code also for haswell
With a few small changes, it is possible to use the Skylake SGEMM code for Haswell as well. This gives a modest gain (in the 10% range) for smallish matrices, and does wonders for very skinny matrices.
This commit is contained in:
parent
78d877b54b
commit
00dc09ad19
|
@ -33,6 +33,7 @@ ZAXPYKERNEL = zaxpy.c
|
|||
|
||||
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
|
||||
SGEMM_BETA = sgemm_beta_skylakex.c
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
|
|
|
@ -61,11 +61,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
|||
c_offset = c;
|
||||
|
||||
if (beta == ZERO){
|
||||
__m512 z_zero;
|
||||
__m256 y_zero;
|
||||
#ifdef __AVX512CD__
|
||||
__m512 z_zero = _mm512_setzero_ps();
|
||||
#endif
|
||||
__m256 y_zero = _mm256_setzero_ps();
|
||||
|
||||
z_zero = _mm512_setzero_ps();
|
||||
y_zero = _mm256_setzero_ps();
|
||||
j = n;
|
||||
do {
|
||||
c_offset1 = c_offset;
|
||||
|
@ -74,8 +74,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
|||
i = m;
|
||||
|
||||
while (i >= 32) {
|
||||
#ifdef __AVX512CD__
|
||||
_mm512_storeu_ps(c_offset1, z_zero);
|
||||
_mm512_storeu_ps(c_offset1 + 16, z_zero);
|
||||
#else
|
||||
_mm256_storeu_ps(c_offset1, y_zero);
|
||||
_mm256_storeu_ps(c_offset1 + 8, y_zero);
|
||||
_mm256_storeu_ps(c_offset1 + 16, y_zero);
|
||||
_mm256_storeu_ps(c_offset1 + 24, y_zero);
|
||||
#endif
|
||||
c_offset1 += 32;
|
||||
i -= 32;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue