From d17238599b573350b166973619039e67fba12fdd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 13:38:28 +0200 Subject: [PATCH] Add casts --- kernel/x86_64/dasum_microk_haswell-2.c | 16 ++++++++-------- kernel/x86_64/sasum_microk_haswell-2.c | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index 4fc73ddd4..fd9da7ebe 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); for (i = 0; i < tail_index_AVX2; i += 16) { - accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); - accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); + accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -63,10 +63,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index 8e6cb9a47..2eb5b9538 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); for (i = 0; i < tail_index_AVX2; i += 32) { - accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); - accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); + accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -62,8 +62,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21;