Make the skylakex/haswell sgemm code compile and run even with compilers without avx2 support

This commit is contained in:
Arjan van de Ven 2018-12-16 00:19:41 +00:00
parent 3843e3e017
commit 69d206440a
1 changed file with 5 additions and 6 deletions

View File

@ -61,10 +61,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
c_offset = c; c_offset = c;
if (beta == ZERO){ if (beta == ZERO){
#ifdef __AVX512CD__
__m512 z_zero = _mm512_setzero_ps();
#endif
__m256 y_zero = _mm256_setzero_ps();
j = n; j = n;
do { do {
@ -72,12 +68,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
c_offset += ldc; c_offset += ldc;
i = m; i = m;
#ifdef __AVX2__
while (i >= 32) { while (i >= 32) {
#ifdef __AVX512CD__ #ifdef __AVX512CD__
__m512 z_zero = _mm512_setzero_ps();
_mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1, z_zero);
_mm512_storeu_ps(c_offset1 + 16, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero);
#else #else
__m256 y_zero = _mm256_setzero_ps();
_mm256_storeu_ps(c_offset1, y_zero); _mm256_storeu_ps(c_offset1, y_zero);
_mm256_storeu_ps(c_offset1 + 8, y_zero); _mm256_storeu_ps(c_offset1 + 8, y_zero);
_mm256_storeu_ps(c_offset1 + 16, y_zero); _mm256_storeu_ps(c_offset1 + 16, y_zero);
@ -87,11 +85,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
i -= 32; i -= 32;
} }
while (i >= 8) { while (i >= 8) {
__m256 y_zero = _mm256_setzero_ps();
_mm256_storeu_ps(c_offset1, y_zero); _mm256_storeu_ps(c_offset1, y_zero);
c_offset1 += 8; c_offset1 += 8;
i -= 8; i -= 8;
} }
#endif
while (i > 0) { while (i > 0) {
*c_offset1 = ZERO; *c_offset1 = ZERO;
c_offset1 ++; c_offset1 ++;