/*
 * dasum (PATCH 1/3): sum of absolute values of a double-precision vector.
 * Reconstructed from the mangled patch text.  Fixes:
 *  - the extraction stripped the "<math.h>" target off the #include; restored.
 * Fallback definitions of FLOAT / BLASLONG / CNAME (normally supplied by
 * common.h and the OpenBLAS build system) are #ifndef-guarded so they do
 * not clash when built in-tree.
 */
#include <math.h>

#ifndef FLOAT
#define FLOAT double            /* from common.h in the real build */
#endif
#ifndef BLASLONG
#define BLASLONG long           /* from common.h in the real build */
#endif
#ifndef CNAME
#define CNAME dasum             /* set by the build system normally */
#endif

#define ABS fabs

#if defined(SKYLAKEX)
#include "dasum_microk_skylakex-2.c"
#elif defined(HASWELL)
#include "dasum_microk_haswell-2.c"
#endif

#ifndef HAVE_KERNEL_16
/*
 * Scalar fallback kernel.  n must be a multiple of 8 (the driver passes
 * n & -16).  Four independent partial sums break the FP-add dependency
 * chain so the loop can pipeline; summation order matches the original.
 */
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
{
    BLASLONG i = 0;
    FLOAT *x = x1;
    FLOAT sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0;

    while (i < n) {
        FLOAT t0 = ABS(x[0]);
        FLOAT t1 = ABS(x[1]);
        FLOAT t2 = ABS(x[2]);
        FLOAT t3 = ABS(x[3]);
        FLOAT t4 = ABS(x[4]);
        FLOAT t5 = ABS(x[5]);
        FLOAT t6 = ABS(x[6]);
        FLOAT t7 = ABS(x[7]);

        sum0 += t0;
        sum1 += t1;
        sum2 += t2;
        sum3 += t3;

        sum0 += t4;
        sum1 += t5;
        sum2 += t6;
        sum3 += t7;

        x += 8;
        i += 8;
    }

    return sum0 + sum1 + sum2 + sum3;
}
#endif

/*
 * BLAS dasum: returns sum(|x[i*inc_x]|) for i = 0..n-1.
 * Returns 0 for n <= 0 or inc_x <= 0 (BLAS convention).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;
    FLOAT sumf = 0.0;
    BLASLONG n1;

    if (n <= 0 || inc_x <= 0) return sumf;

    if (inc_x == 1) {
        n1 = n & -16;           /* multiple-of-16 prefix handled by the kernel */
        if (n1 > 0) {
            sumf = dasum_kernel_16(n1, x);
            i = n1;
        }
        while (i < n) {         /* scalar tail, up to 15 elements */
            sumf += ABS(x[i]);
            i++;
        }
    } else {
        n *= inc_x;             /* loop bound expressed in element offsets */
        while (i < n) {
            sumf += ABS(x[i]);
            i += inc_x;
        }
    }
    return sumf;
}
/*
 * AVX-512 dasum micro-kernel (PATCH 1/3, dasum_microk_skylakex-2.c).
 * Fixes vs. the patch text:
 *  - restored the stripped "<immintrin.h>" include target;
 *  - merged the two #if levels and dropped the irrelevant __AVX2__ test in
 *    the GCC branch (this is what PATCH 2/3 does for the haswell files);
 *  - GNU vector '+=' replaced by portable _mm512_add_pd.
 * n must be a multiple of 16 (the driver passes n & -16).
 */
#if ((defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX512CD__)

#define HAVE_KERNEL_16 1

#include <immintrin.h>

#ifndef FLOAT
#define FLOAT double            /* from common.h in the real build */
#endif
#ifndef BLASLONG
#define BLASLONG long
#endif

static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
{
    BLASLONG i;
    /* Two accumulators hide the latency of the vector adds. */
    __m512d acc0 = _mm512_setzero_pd();
    __m512d acc1 = _mm512_setzero_pd();

    for (i = 0; i < n; i += 16) {
        acc0 = _mm512_add_pd(acc0, _mm512_abs_pd(_mm512_loadu_pd(&x1[i + 0])));
        acc1 = _mm512_add_pd(acc1, _mm512_abs_pd(_mm512_loadu_pd(&x1[i + 8])));
    }

    acc0 = _mm512_add_pd(acc0, acc1);
    return _mm512_reduce_add_pd(acc0);      /* horizontal reduction */
}
#endif
/*
 * AVX2 sasum micro-kernel (PATCH 1/3, sasum_microk_haswell-2.c).
 * Fixes vs. the patch text:
 *  - restored the stripped "<immintrin.h>" include target;
 *  - guard rewritten as "... && defined(__AVX2__)" for both compilers,
 *    matching PATCH 2/3 (the clang branch previously enabled AVX2 code
 *    without any ISA check);
 *  - explicit (const __m256i *) casts on the integer loads (the intrinsic
 *    takes __m256i const *, not float *);
 *  - GNU vector '+=' / 'v[0]' replaced by _mm256_add_ps / _mm_cvtss_f32.
 * n must be a multiple of 32 (the driver passes n & -32).
 */
#if ((defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)

#define HAVE_KERNEL_32 1

#include <immintrin.h>

#ifndef FLOAT
#define FLOAT float             /* from common.h in the real build */
#endif
#ifndef BLASLONG
#define BLASLONG long
#endif

static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
{
    BLASLONG i;
    __m256 acc0 = _mm256_setzero_ps();
    __m256 acc1 = _mm256_setzero_ps();
    __m256 acc2 = _mm256_setzero_ps();
    __m256 acc3 = _mm256_setzero_ps();

    /* |x| = x with the IEEE sign bit masked off. */
    const __m256i abs_mask = _mm256_set1_epi32(0x7fffffff);

    for (i = 0; i < n; i += 32) {
        acc0 = _mm256_add_ps(acc0, _mm256_castsi256_ps(
                   _mm256_and_si256(_mm256_loadu_si256((const __m256i *)&x1[i +  0]), abs_mask)));
        acc1 = _mm256_add_ps(acc1, _mm256_castsi256_ps(
                   _mm256_and_si256(_mm256_loadu_si256((const __m256i *)&x1[i +  8]), abs_mask)));
        acc2 = _mm256_add_ps(acc2, _mm256_castsi256_ps(
                   _mm256_and_si256(_mm256_loadu_si256((const __m256i *)&x1[i + 16]), abs_mask)));
        acc3 = _mm256_add_ps(acc3, _mm256_castsi256_ps(
                   _mm256_and_si256(_mm256_loadu_si256((const __m256i *)&x1[i + 24]), abs_mask)));
    }

    /* Same association as the original: ((a0 + a1) + a2) + a3. */
    acc0 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(acc0, acc1), acc2), acc3);

    /* Horizontal sum: fold 256 -> 128 bits, then two hadds. */
    __m128 half = _mm_add_ps(_mm256_castps256_ps128(acc0),
                             _mm256_extractf128_ps(acc0, 1));
    half = _mm_hadd_ps(half, half);
    half = _mm_hadd_ps(half, half);
    return _mm_cvtss_f32(half);
}
#endif
/*
 * AVX-512 sasum micro-kernel (PATCH 1/3, sasum_microk_skylakex-2.c).
 * Fixes vs. the patch text:
 *  - restored the stripped "<immintrin.h>" include target;
 *  - merged the two #if levels and dropped the irrelevant __AVX2__ test
 *    from the GCC branch;
 *  - GNU vector '+=' replaced by portable _mm512_add_ps.
 * n must be a multiple of 32 (the driver passes n & -32).
 */
#if ((defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX512CD__)

#define HAVE_KERNEL_32 1

#include <immintrin.h>

#ifndef FLOAT
#define FLOAT float             /* from common.h in the real build */
#endif
#ifndef BLASLONG
#define BLASLONG long
#endif

static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
{
    BLASLONG i;
    /* Two accumulators hide the latency of the vector adds. */
    __m512 acc0 = _mm512_setzero_ps();
    __m512 acc1 = _mm512_setzero_ps();

    for (i = 0; i < n; i += 32) {
        acc0 = _mm512_add_ps(acc0, _mm512_abs_ps(_mm512_loadu_ps(&x1[i +  0])));
        acc1 = _mm512_add_ps(acc1, _mm512_abs_ps(_mm512_loadu_ps(&x1[i + 16])));
    }

    acc0 = _mm512_add_ps(acc0, acc1);
    return _mm512_reduce_add_ps(acc0);      /* horizontal reduction */
}
#endif
/*
 * dasum (PATCH 3/3, kernel/x86_64/dasum.c): scalar fallback kernel plus
 * the BLAS driver.  Reconstructed from the mangled patch text; fallback
 * macros for FLOAT / BLASLONG / CNAME (normally from common.h and the
 * build system) are #ifndef-guarded.
 */
#ifndef ABS_K
/* Branch-based |a|.  NOTE: evaluates the argument twice — only pass
 * side-effect-free lvalues (all call sites here comply). */
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

#ifndef FLOAT
#define FLOAT double            /* from common.h in the real build */
#endif
#ifndef BLASLONG
#define BLASLONG long
#endif
#ifndef CNAME
#define CNAME dasum             /* set by the build system normally */
#endif

#if defined(SKYLAKEX)
#include "dasum_microk_skylakex-2.c"
#elif defined(HASWELL)
#include "dasum_microk_haswell-2.c"
#endif

#ifndef HAVE_DASUM_KERNEL
/*
 * Scalar fallback: sum of |x1[i]| for contiguous x1, any n >= 0.
 * Unrolled by 8 with four partial sums to break the FP dependency chain;
 * sum4 collects the final 0..7 tail elements.
 */
static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
{
    BLASLONG i = 0;
    BLASLONG n_8 = n & -8;      /* largest multiple of 8 that is <= n */
    FLOAT *x = x1;
    FLOAT sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0;

    while (i < n_8) {
        sum0 += ABS_K(x[0]);
        sum1 += ABS_K(x[1]);
        sum2 += ABS_K(x[2]);
        sum3 += ABS_K(x[3]);

        sum0 += ABS_K(x[4]);
        sum1 += ABS_K(x[5]);
        sum2 += ABS_K(x[6]);
        sum3 += ABS_K(x[7]);

        x += 8;
        i += 8;
    }

    while (i < n) {             /* scalar tail */
        sum4 += ABS_K(x1[i]);
        i++;
    }

    return sum0 + sum1 + sum2 + sum3 + sum4;
}
#endif

/*
 * BLAS dasum: returns sum(|x[i*inc_x]|) for i = 0..n-1.
 * Returns 0 for n <= 0 or inc_x <= 0 (BLAS convention).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;
    FLOAT sumf = 0.0;

    if (n <= 0 || inc_x <= 0) return sumf;

    if (inc_x == 1) {
        sumf = dasum_kernel(n, x);   /* kernel now handles any n itself */
    } else {
        n *= inc_x;                  /* loop bound in element offsets */
        while (i < n) {
            sumf += ABS_K(x[i]);
            i += inc_x;
        }
    }
    return sumf;
}
/*
 * AVX2 dasum micro-kernel (PATCH 3/3, dasum_microk_haswell-2.c).
 * Handles any n: scalar head until x1 is 32-byte aligned (only when
 * n >= 256), 256-element AVX2 chunks, an SSE2 loop on the 8..255-element
 * remainder, and a scalar loop on the last <8 elements.
 * Fixes vs. the patch text: restored stripped include targets; added the
 * (const __m256i *) / (const __m128i *) casts the load intrinsics require;
 * replaced GNU vector '+=' / 'v[0]' with portable intrinsics.
 */
#if ((defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)

#define HAVE_DASUM_KERNEL

#include <immintrin.h>
#include <stdint.h>

#ifndef FLOAT
#define FLOAT double            /* from common.h in the real build */
#endif
#ifndef BLASLONG
#define BLASLONG long
#endif
#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
{
    BLASLONG i = 0;
    FLOAT sumf = 0.0;

    if (n >= 256) {
        /* 0..3 leading doubles needed to reach a 32-byte boundary. */
        BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 3) & 0x3;

        for (i = 0; i < align_256; i++) {
            sumf += ABS_K(x1[i]);
        }
        n  -= align_256;
        x1 += align_256;
    }

    BLASLONG tail_index_SSE  = n & (~7);    /* end of the SSE-processed region  */
    BLASLONG tail_index_AVX2 = n & (~255);  /* end of the AVX2-processed region */

    if (n >= 256) {
        __m256d acc0 = _mm256_setzero_pd();
        __m256d acc1 = _mm256_setzero_pd();
        __m256d acc2 = _mm256_setzero_pd();
        __m256d acc3 = _mm256_setzero_pd();

        /* |x| by clearing the IEEE sign bit. */
        const __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);

        for (i = 0; i < tail_index_AVX2; i += 16) {
            /* Aligned loads are valid: x1 was advanced to 32B above. */
            acc0 = _mm256_add_pd(acc0, _mm256_castsi256_pd(
                       _mm256_and_si256(_mm256_load_si256((const __m256i *)&x1[i +  0]), abs_mask)));
            acc1 = _mm256_add_pd(acc1, _mm256_castsi256_pd(
                       _mm256_and_si256(_mm256_load_si256((const __m256i *)&x1[i +  4]), abs_mask)));
            acc2 = _mm256_add_pd(acc2, _mm256_castsi256_pd(
                       _mm256_and_si256(_mm256_load_si256((const __m256i *)&x1[i +  8]), abs_mask)));
            acc3 = _mm256_add_pd(acc3, _mm256_castsi256_pd(
                       _mm256_and_si256(_mm256_load_si256((const __m256i *)&x1[i + 12]), abs_mask)));
        }

        acc0 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(acc0, acc1), acc2), acc3);

        __m128d half = _mm_add_pd(_mm256_castpd256_pd128(acc0),
                                  _mm256_extractf128_pd(acc0, 1));
        half = _mm_hadd_pd(half, half);
        sumf += _mm_cvtsd_f64(half);
    }

    if (n >= 8) {
        __m128d a0 = _mm_setzero_pd();
        __m128d a1 = _mm_setzero_pd();
        __m128d a2 = _mm_setzero_pd();
        __m128d a3 = _mm_setzero_pd();

        const __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);

        for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
            a0 = _mm_add_pd(a0, _mm_castsi128_pd(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 0]), abs_mask2)));
            a1 = _mm_add_pd(a1, _mm_castsi128_pd(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 2]), abs_mask2)));
            a2 = _mm_add_pd(a2, _mm_castsi128_pd(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 4]), abs_mask2)));
            a3 = _mm_add_pd(a3, _mm_castsi128_pd(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 6]), abs_mask2)));
        }

        a0 = _mm_add_pd(_mm_add_pd(_mm_add_pd(a0, a1), a2), a3);
        __m128d h = _mm_hadd_pd(a0, a0);
        sumf += _mm_cvtsd_f64(h);
    }

    for (i = tail_index_SSE; i < n; ++i) {  /* final <8-element scalar tail */
        sumf += ABS_K(x1[i]);
    }

    return sumf;
}
#endif
/*
 * AVX-512 dasum micro-kernel (PATCH 3/3, dasum_microk_skylakex-2.c).
 * Handles any n: scalar head to a 64-byte boundary (only when n >= 256),
 * 256-element AVX-512 chunks, an SSE2 loop on the remainder, then scalar.
 * Fixes vs. the patch text:
 *  - the guard enabled AVX-512 code for clang >= 9 *without* checking
 *    __AVX512CD__, breaking clang builds that lack -mavx512*; the ISA
 *    check now applies to both compilers;
 *  - restored stripped include targets; added (const __m128i *) casts;
 *  - GNU vector '+=' / 'v[0]' replaced by portable intrinsics.
 */
#if ((defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 9)) && defined(__AVX512CD__)

#define HAVE_DASUM_KERNEL 1

#include <immintrin.h>
#include <stdint.h>

#ifndef FLOAT
#define FLOAT double            /* from common.h in the real build */
#endif
#ifndef BLASLONG
#define BLASLONG long
#endif
#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
{
    BLASLONG i = 0;
    FLOAT sumf = 0.0;

    if (n >= 256) {
        /* 0..7 leading doubles needed to reach a 64-byte boundary. */
        BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7;

        for (i = 0; i < align_512; i++) {
            sumf += ABS_K(x1[i]);
        }
        n  -= align_512;
        x1 += align_512;
    }

    BLASLONG tail_index_SSE    = n & (~7);
    BLASLONG tail_index_AVX512 = n & (~255);

    if (n >= 256) {
        __m512d acc0 = _mm512_setzero_pd();
        __m512d acc1 = _mm512_setzero_pd();
        __m512d acc2 = _mm512_setzero_pd();
        __m512d acc3 = _mm512_setzero_pd();

        for (i = 0; i < tail_index_AVX512; i += 32) {
            /* Aligned loads are valid: x1 was advanced to 64B above. */
            acc0 = _mm512_add_pd(acc0, _mm512_abs_pd(_mm512_load_pd(&x1[i +  0])));
            acc1 = _mm512_add_pd(acc1, _mm512_abs_pd(_mm512_load_pd(&x1[i +  8])));
            acc2 = _mm512_add_pd(acc2, _mm512_abs_pd(_mm512_load_pd(&x1[i + 16])));
            acc3 = _mm512_add_pd(acc3, _mm512_abs_pd(_mm512_load_pd(&x1[i + 24])));
        }

        acc0 = _mm512_add_pd(_mm512_add_pd(_mm512_add_pd(acc0, acc1), acc2), acc3);
        sumf += _mm512_reduce_add_pd(acc0);
    }

    if (n >= 8) {
        __m128d a0 = _mm_setzero_pd();
        __m128d a1 = _mm_setzero_pd();
        __m128d a2 = _mm_setzero_pd();
        __m128d a3 = _mm_setzero_pd();

        const __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);

        for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) {
            a0 = _mm_add_pd(a0, _mm_castsi128_pd(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 0]), abs_mask2)));
            a1 = _mm_add_pd(a1, _mm_castsi128_pd(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 2]), abs_mask2)));
            a2 = _mm_add_pd(a2, _mm_castsi128_pd(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 4]), abs_mask2)));
            a3 = _mm_add_pd(a3, _mm_castsi128_pd(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 6]), abs_mask2)));
        }

        a0 = _mm_add_pd(_mm_add_pd(_mm_add_pd(a0, a1), a2), a3);
        __m128d h = _mm_hadd_pd(a0, a0);
        sumf += _mm_cvtsd_f64(h);
    }

    for (i = tail_index_SSE; i < n; ++i) {  /* final <8-element scalar tail */
        sumf += ABS_K(x1[i]);
    }

    return sumf;
}
#endif
/*
 * sasum (PATCH 3/3, kernel/x86_64/sasum.c): single-precision sum of
 * absolute values — scalar fallback kernel plus the BLAS driver.
 * Reconstructed from the mangled patch text; FLOAT / BLASLONG / CNAME
 * fallbacks are #ifndef-guarded (normally from common.h / build system).
 */
#if defined(DOUBLE)
#error "sasum.c supports single precision only"
#endif

#ifndef ABS_K
/* Branch-based |a|.  NOTE: evaluates the argument twice — only pass
 * side-effect-free lvalues (all call sites here comply). */
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

#ifndef FLOAT
#define FLOAT float             /* from common.h in the real build */
#endif
#ifndef BLASLONG
#define BLASLONG long
#endif
#ifndef CNAME
#define CNAME sasum             /* set by the build system normally */
#endif

#if defined(SKYLAKEX)
#include "sasum_microk_skylakex-2.c"
#elif defined(HASWELL)
#include "sasum_microk_haswell-2.c"
#endif

#ifndef HAVE_SASUM_KERNEL
/*
 * Scalar fallback: sum of |x1[i]| for contiguous x1, any n >= 0.
 * Unrolled by 8 with four partial sums; sum4 collects the 0..7 tail.
 */
static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
{
    BLASLONG i = 0;
    BLASLONG n_8 = n & -8;      /* largest multiple of 8 that is <= n */
    FLOAT *x = x1;
    FLOAT sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0;

    while (i < n_8) {
        sum0 += ABS_K(x[0]);
        sum1 += ABS_K(x[1]);
        sum2 += ABS_K(x[2]);
        sum3 += ABS_K(x[3]);

        sum0 += ABS_K(x[4]);
        sum1 += ABS_K(x[5]);
        sum2 += ABS_K(x[6]);
        sum3 += ABS_K(x[7]);

        x += 8;
        i += 8;
    }

    while (i < n) {             /* scalar tail */
        sum4 += ABS_K(x1[i]);
        i++;
    }

    return sum0 + sum1 + sum2 + sum3 + sum4;
}
#endif

/*
 * BLAS sasum: returns sum(|x[i*inc_x]|) for i = 0..n-1.
 * Returns 0 for n <= 0 or inc_x <= 0 (BLAS convention).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;
    FLOAT sumf = 0.0;

    if (n <= 0 || inc_x <= 0) return sumf;

    if (inc_x == 1) {
        sumf = sasum_kernel(n, x);   /* kernel handles any n itself */
    } else {
        n *= inc_x;                  /* loop bound in element offsets */
        while (i < n) {
            sumf += ABS_K(x[i]);
            i += inc_x;
        }
    }
    return sumf;
}
/*
 * AVX2 sasum micro-kernel (PATCH 3/3, sasum_microk_haswell-2.c).
 * Handles any n: scalar head to a 32-byte boundary (only when n >= 256),
 * 256-element AVX2 chunks, an SSE loop on the 8..255-element remainder,
 * and a scalar loop on the last <8 elements.
 * Fixes vs. the patch text: restored stripped include targets; added the
 * (const __m256i *) / (const __m128i *) casts the load intrinsics require;
 * replaced GNU vector '+=' / 'v[0]' with portable intrinsics.
 */
#if ((defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)

#define HAVE_SASUM_KERNEL 1

#include <immintrin.h>
#include <stdint.h>

#ifndef FLOAT
#define FLOAT float             /* from common.h in the real build */
#endif
#ifndef BLASLONG
#define BLASLONG long
#endif
#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
{
    BLASLONG i = 0;
    FLOAT sumf = 0.0;

    if (n >= 256) {
        /* 0..7 leading floats needed to reach a 32-byte boundary. */
        BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 2) & 0x7;

        for (i = 0; i < align_256; i++) {
            sumf += ABS_K(x1[i]);
        }
        n  -= align_256;
        x1 += align_256;
    }

    BLASLONG tail_index_SSE  = n & (~7);    /* end of the SSE-processed region  */
    BLASLONG tail_index_AVX2 = n & (~255);  /* end of the AVX2-processed region */

    if (n >= 256) {
        __m256 acc0 = _mm256_setzero_ps();
        __m256 acc1 = _mm256_setzero_ps();
        __m256 acc2 = _mm256_setzero_ps();
        __m256 acc3 = _mm256_setzero_ps();

        /* |x| by clearing the IEEE sign bit. */
        const __m256i abs_mask = _mm256_set1_epi32(0x7fffffff);

        for (i = 0; i < tail_index_AVX2; i += 32) {
            /* Aligned loads are valid: x1 was advanced to 32B above. */
            acc0 = _mm256_add_ps(acc0, _mm256_castsi256_ps(
                       _mm256_and_si256(_mm256_load_si256((const __m256i *)&x1[i +  0]), abs_mask)));
            acc1 = _mm256_add_ps(acc1, _mm256_castsi256_ps(
                       _mm256_and_si256(_mm256_load_si256((const __m256i *)&x1[i +  8]), abs_mask)));
            acc2 = _mm256_add_ps(acc2, _mm256_castsi256_ps(
                       _mm256_and_si256(_mm256_load_si256((const __m256i *)&x1[i + 16]), abs_mask)));
            acc3 = _mm256_add_ps(acc3, _mm256_castsi256_ps(
                       _mm256_and_si256(_mm256_load_si256((const __m256i *)&x1[i + 24]), abs_mask)));
        }

        acc0 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(acc0, acc1), acc2), acc3);

        __m128 half = _mm_add_ps(_mm256_castps256_ps128(acc0),
                                 _mm256_extractf128_ps(acc0, 1));
        half = _mm_hadd_ps(half, half);
        half = _mm_hadd_ps(half, half);
        sumf += _mm_cvtss_f32(half);
    }

    if (n >= 8) {
        __m128 a0 = _mm_setzero_ps();
        __m128 a1 = _mm_setzero_ps();

        const __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);

        for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
            a0 = _mm_add_ps(a0, _mm_castsi128_ps(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 0]), abs_mask2)));
            a1 = _mm_add_ps(a1, _mm_castsi128_ps(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 4]), abs_mask2)));
        }

        a0 = _mm_add_ps(a0, a1);
        a0 = _mm_hadd_ps(a0, a0);
        a0 = _mm_hadd_ps(a0, a0);
        sumf += _mm_cvtss_f32(a0);
    }

    for (i = tail_index_SSE; i < n; ++i) {  /* final <8-element scalar tail */
        sumf += ABS_K(x1[i]);
    }

    return sumf;
}
#endif
/*
 * AVX-512 sasum micro-kernel (PATCH 3/3, sasum_microk_skylakex-2.c).
 * Handles any n: scalar head to a 64-byte boundary (only when n >= 256),
 * 256-element AVX-512 chunks, an SSE loop on the remainder, then scalar.
 * Fixes vs. the patch text:
 *  - the guard enabled AVX-512 code for clang >= 9 *without* checking
 *    __AVX512CD__, breaking clang builds that lack -mavx512*; the ISA
 *    check now applies to both compilers;
 *  - restored stripped include targets; added (const __m128i *) casts;
 *  - GNU vector '+=' / 'v[0]' replaced by portable intrinsics.
 */
#if ((defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 9)) && defined(__AVX512CD__)

#define HAVE_SASUM_KERNEL 1

#include <immintrin.h>
#include <stdint.h>

#ifndef FLOAT
#define FLOAT float             /* from common.h in the real build */
#endif
#ifndef BLASLONG
#define BLASLONG long
#endif
#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
{
    BLASLONG i = 0;
    FLOAT sumf = 0.0;

    if (n >= 256) {
        /* 0..15 leading floats needed to reach a 64-byte boundary. */
        BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf;

        for (i = 0; i < align_512; i++) {
            sumf += ABS_K(x1[i]);
        }
        n  -= align_512;
        x1 += align_512;
    }

    BLASLONG tail_index_SSE    = n & (~7);
    BLASLONG tail_index_AVX512 = n & (~255);

    if (n >= 256) {
        __m512 acc0 = _mm512_setzero_ps();
        __m512 acc1 = _mm512_setzero_ps();
        __m512 acc2 = _mm512_setzero_ps();
        __m512 acc3 = _mm512_setzero_ps();

        for (i = 0; i < tail_index_AVX512; i += 64) {
            /* Aligned loads are valid: x1 was advanced to 64B above. */
            acc0 = _mm512_add_ps(acc0, _mm512_abs_ps(_mm512_load_ps(&x1[i +  0])));
            acc1 = _mm512_add_ps(acc1, _mm512_abs_ps(_mm512_load_ps(&x1[i + 16])));
            acc2 = _mm512_add_ps(acc2, _mm512_abs_ps(_mm512_load_ps(&x1[i + 32])));
            acc3 = _mm512_add_ps(acc3, _mm512_abs_ps(_mm512_load_ps(&x1[i + 48])));
        }

        acc0 = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(acc0, acc1), acc2), acc3);
        sumf += _mm512_reduce_add_ps(acc0);
    }

    if (n >= 8) {
        __m128 a0 = _mm_setzero_ps();
        __m128 a1 = _mm_setzero_ps();

        const __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);

        for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) {
            a0 = _mm_add_ps(a0, _mm_castsi128_ps(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 0]), abs_mask2)));
            a1 = _mm_add_ps(a1, _mm_castsi128_ps(
                     _mm_and_si128(_mm_loadu_si128((const __m128i *)&x1[i + 4]), abs_mask2)));
        }

        a0 = _mm_add_ps(a0, a1);
        a0 = _mm_hadd_ps(a0, a0);
        a0 = _mm_hadd_ps(a0, a0);
        sumf += _mm_cvtss_f32(a0);
    }

    for (i = tail_index_SSE; i < n; i++) {  /* final <8-element scalar tail */
        sumf += ABS_K(x1[i]);
    }

    return sumf;
}
#endif