Merge pull request #4330 from bartoldeman/asum-init-mask

Use _mm_set1_epi32 / _mm_set1_epi64x to initialize the absolute-value mask in the x86-64 casum and zasum kernels
This commit is contained in:
Martin Kroeker 2023-11-20 05:38:39 +01:00 committed by GitHub
commit 2ea65bacd0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 8 additions and 13 deletions

View File

@ -2,10 +2,9 @@
#ifdef __NVCOMPILER
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
#endif
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
#if (!(defined(__NVCOMPILER) ))
//&& NVCOMPVERS < 2309))
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
#define HAVE_CASUM_KERNEL 1
@ -21,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
if (n2 < 64) {
__m128 accum_10, accum_11, accum_12, accum_13;
__m128 abs_mask1 = abs_mask1;
__m128 abs_mask1;
accum_10 = _mm_setzero_ps();
accum_11 = _mm_setzero_ps();
accum_12 = _mm_setzero_ps();
accum_13 = _mm_setzero_ps();
abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff);
_mm_prefetch(&x1[0], _MM_HINT_T0);

View File

@ -2,10 +2,9 @@
#ifdef __NVCOMPILER
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
#endif
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
#if (!(defined(__NVCOMPILER) ))
//&& NVCOMPVERS < 2309))
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
#define HAVE_ZASUM_KERNEL 1
@ -22,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
if (n2 < 32) {
__m128d accum_10, accum_11, accum_12, accum_13;
__m128d abs_mask1 = abs_mask1;
__m128d abs_mask1;
accum_10 = _mm_setzero_pd();
accum_11 = _mm_setzero_pd();
accum_12 = _mm_setzero_pd();
accum_13 = _mm_setzero_pd();
// abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
_mm_prefetch(&x1[0], _MM_HINT_T0);
if (n2 >= 16){