Merge pull request #3375 from martin-frbg/issue3369

Add casts to eliminate compiler warnings for Haswell sasum/dasum
This commit is contained in:
Martin Kroeker 2021-09-12 00:01:20 +02:00 committed by GitHub
commit d1ee2e9c7d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 14 additions and 14 deletions

View File

@@ -38,10 +38,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
__m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
for (i = 0; i < tail_index_AVX2; i += 16) { for (i = 0; i < tail_index_AVX2; i += 16) {
accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask);
accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask);
accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask);
accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask);
} }
accum_0 = accum_0 + accum_1 + accum_2 + accum_3; accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
@@ -63,10 +63,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
__m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);
for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2);
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2);
} }
accum_20 = accum_20 + accum_21 + accum_22 + accum_23; accum_20 = accum_20 + accum_21 + accum_22 + accum_23;

View File

@@ -38,10 +38,10 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
__m256i abs_mask = _mm256_set1_epi32(0x7fffffff); __m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
for (i = 0; i < tail_index_AVX2; i += 32) { for (i = 0; i < tail_index_AVX2; i += 32) {
accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask);
accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask);
accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask);
accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask);
} }
accum_0 = accum_0 + accum_1 + accum_2 + accum_3; accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
@@ -62,8 +62,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
__m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);
for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
} }
accum_20 += accum_21; accum_20 += accum_21;