From c59652f0ce88ea7bba97704f332c3ec77bd528c9 Mon Sep 17 00:00:00 2001
From: pnp
Date: Fri, 30 Apr 2021 12:14:58 -0400
Subject: [PATCH 1/3] optimize on sgemv_n for small n

---
 kernel/x86_64/sgemv_n_4.c                 |  56 ++++-
 kernel/x86_64/sgemv_n_microk_skylakex-8.c | 258 ++++++++++++++++++++++
 2 files changed, 304 insertions(+), 10 deletions(-)
 create mode 100644 kernel/x86_64/sgemv_n_microk_skylakex-8.c

diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index 3eec21774..81d495eae 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -35,8 +35,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "sgemv_n_microk_nehalem-4.c"
 #elif defined(SANDYBRIDGE)
 #include "sgemv_n_microk_sandy-4.c"
-#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE)
+#elif defined(HASWELL) || defined(ZEN)
 #include "sgemv_n_microk_haswell-4.c"
+#elif defined (SKYLAKEX) || defined (COOPERLAKE)
+#include "sgemv_n_microk_haswell-4.c"
+#include "sgemv_n_microk_skylakex-8.c"
+#endif
+
 #endif
 
 #if defined(STEAMROLLER) || defined(EXCAVATOR)
@@ -291,6 +296,41 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 {
+    if ( m < 1 || n < 1) return(0);
+
+    #ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL
+    if (m <= 16384 && n <= 48 && !(n == 4))
+    {
+        FLOAT * xbuffer_align = x;
+        FLOAT * ybuffer_align = y;
+
+        FLOAT * xbuffer = NULL;
+        FLOAT * ybuffer = NULL;
+
+        if (inc_x != 1) {
+            xbuffer_align = buffer;
+            for(BLASLONG i=0; i<n; i++) {
+                xbuffer_align[i] = x[i*inc_x];
+            }
+        }
+
+        if (inc_y != 1) {
+            ybuffer_align = buffer + n;
+            for(BLASLONG i=0; i<m; i++) {
+                ybuffer_align[i] = y[i*inc_y];
+            }
+        }
+
+        sgemv_kernel_n_128(m, n, alpha, a, lda, xbuffer_align, ybuffer_align);
+
+        if (inc_y != 1) {
+            for(BLASLONG i=0; i<m; i++) {
+                y[i*inc_y] = ybuffer_align[i];
+            }
+        }
+        return(0);
+    }
+    #endif
diff --git a/kernel/x86_64/sgemv_n_microk_skylakex-8.c b/kernel/x86_64/sgemv_n_microk_skylakex-8.c
new file mode 100644
--- /dev/null
+++ b/kernel/x86_64/sgemv_n_microk_skylakex-8.c
@@ -0,0 +1,258 @@
+#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#define HAVE_SGEMV_N_SKYLAKE_KERNEL 1
+#include "common.h"
+#include <immintrin.h>
+static int sgemv_kernel_n_128(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, float *x, float *y)
+{
+    __m512 matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7;
+    __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7;
+    __m512 xArray_0;
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+    BLASLONG tag_m_128x = m & (~127);
+    BLASLONG tag_m_64x = m & (~63);
+    BLASLONG tag_m_32x = m & (~31);
+    BLASLONG tag_m_16x = m & (~15);
+
+    for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) {
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+        accum512_2 = _mm512_setzero_ps();
+        accum512_3 = _mm512_setzero_ps();
+        accum512_4 = _mm512_setzero_ps();
+        accum512_5 = _mm512_setzero_ps();
+        accum512_6 = _mm512_setzero_ps();
+        accum512_7 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_ps(x[idx_n]);
+
+            matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]);
+            matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]);
+            matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]);
+            matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]);
+            matrixArray_4 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 64]);
+            matrixArray_5 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 80]);
+            matrixArray_6 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 96]);
+            matrixArray_7 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 112]);
+
+            accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0);
+            accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1);
+            accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2);
+            accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3);
+            accum512_4 = _mm512_fmadd_ps(matrixArray_4, xArray_0, accum512_4);
+            accum512_5 = _mm512_fmadd_ps(matrixArray_5, xArray_0, accum512_5);
+            accum512_6 = _mm512_fmadd_ps(matrixArray_6, xArray_0, accum512_6);
+            accum512_7 = _mm512_fmadd_ps(matrixArray_7, xArray_0, accum512_7);
+        }
+
+        _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0])));
+        _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16])));
+        _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 32])));
+        _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48])));
+        _mm512_storeu_ps(&y[idx_m + 64], _mm512_fmadd_ps(accum512_4, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 64])));
+        _mm512_storeu_ps(&y[idx_m + 80], _mm512_fmadd_ps(accum512_5, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 80])));
+        _mm512_storeu_ps(&y[idx_m + 96], _mm512_fmadd_ps(accum512_6, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 96])));
+        _mm512_storeu_ps(&y[idx_m + 112], _mm512_fmadd_ps(accum512_7, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 112])));
+    }
+    if (tag_m_128x != m) {
+        for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_64x; idx_m+=64) {
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+            accum512_2 = _mm512_setzero_ps();
+            accum512_3 = _mm512_setzero_ps();
+
+            for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+                xArray_0 = _mm512_set1_ps(x[idx_n]);
+
+                matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]);
+                matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]);
+                matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]);
+                matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]);
+
+                accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0);
+                accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1);
+                accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2);
+                accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3);
+            }
+
+            _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0])));
+            _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16])));
+            _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 32])));
+            _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48])));
+        }
+
+        if(tag_m_64x != m) {
+            for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_32x; idx_m+=32) {
+                accum512_0 = _mm512_setzero_ps();
+                accum512_1 = _mm512_setzero_ps();
+
+                for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+                    xArray_0 = _mm512_set1_ps(x[idx_n]);
+
+                    matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]);
+                    matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]);
+
+                    accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0);
+                    accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1);
+                }
+
+                _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0])));
+                _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16])));
+            }
+
+            if(tag_m_32x != m) {
+
+                for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) {
+                    accum512_0 = _mm512_setzero_ps();
+
+                    for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+                        xArray_0 = _mm512_set1_ps(x[idx_n]);
+
+                        matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]);
+
+                        accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0);
+                    }
+
+                    _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0])));
+                }
+
+                if (tag_m_16x != m) {
+                    accum512_0 = _mm512_setzero_ps();
+
+                    unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15)));
+                    __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+
+                    for(BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+                        xArray_0 = _mm512_set1_ps(x[idx_n]);
+                        matrixArray_0 = _mm512_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_16x]);
+
+                        accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0);
+                    }
+
+                    _mm512_mask_storeu_ps(&y[tag_m_16x], tail_mask, _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_maskz_loadu_ps(tail_mask, &y[tag_m_16x])));
+
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, float *x, float *y)
+{
+    __m256 ma0, ma1, ma2, ma3, ma4, ma5, ma6, ma7;
+    __m256 as0, as1, as2, as3, as4, as5, as6, as7;
+    __m256 alphav = _mm256_set1_ps(alpha);
+    __m256 xv;
+    BLASLONG tag_m_32x = m & (~31);
+    BLASLONG tag_m_16x = m & (~15);
+    BLASLONG tag_m_8x = m & (~7);
+    __mmask8 one_mask = 0xff;
+
+    for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) {
+        as0 = _mm256_setzero_ps();
+        as1 = _mm256_setzero_ps();
+        as2 = _mm256_setzero_ps();
+        as3 = _mm256_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xv = _mm256_set1_ps(x[idx_n]);
+            ma0 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]);
+            ma1 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]);
+            ma2 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +16]);
+            ma3 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +24]);
+
+            as0 = _mm256_maskz_fmadd_ps(one_mask, ma0, xv, as0);
+            as1 = _mm256_maskz_fmadd_ps(one_mask, ma1, xv, as1);
+            as2 = _mm256_maskz_fmadd_ps(one_mask, ma2, xv, as2);
+            as3 = _mm256_maskz_fmadd_ps(one_mask, ma3, xv, as3);
+        }
+        _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as0, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m])));
+        _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as1, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8])));
+        _mm256_mask_storeu_ps(&y[idx_m + 16], one_mask, _mm256_maskz_fmadd_ps(one_mask, as2, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 16])));
+        _mm256_mask_storeu_ps(&y[idx_m + 24], one_mask, _mm256_maskz_fmadd_ps(one_mask, as3, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 24])));
+
+    }
+
+    if (tag_m_32x != m ) {
+        for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) {
+            as4 = _mm256_setzero_ps();
+            as5 = _mm256_setzero_ps();
+
+            for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+                xv = _mm256_set1_ps(x[idx_n]);
+                ma4 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]);
+                ma5 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]);
+
+                as4 = _mm256_maskz_fmadd_ps(one_mask, ma4, xv, as4);
+                as5 = _mm256_maskz_fmadd_ps(one_mask, ma5, xv, as5);
+            }
+            _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as4, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m])));
+            _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as5, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8])));
+        }
+
+        if (tag_m_16x != m ) {
+            for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) {
+                as6 = _mm256_setzero_ps();
+
+                for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+                    xv = _mm256_set1_ps(x[idx_n]);
+                    ma6 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m]);
+                    as6 = _mm256_maskz_fmadd_ps(one_mask, ma6, xv, as6);
+                }
+                _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as6, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m])));
+            }
+
+            if (tag_m_8x != m) {
+                as7 = _mm256_setzero_ps();
+
+                unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(m&7)));
+                __mmask8 tail_mask = *((__mmask8*) &tail_mask_uint);
+
+                for(BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+                    xv = _mm256_set1_ps(x[idx_n]);
+                    ma7 = _mm256_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_8x]);
+
+                    as7 = _mm256_maskz_fmadd_ps(tail_mask, ma7, xv, as7);
+                }
+
+                _mm256_mask_storeu_ps(&y[tag_m_8x], tail_mask, _mm256_maskz_fmadd_ps(tail_mask, as7, alphav, _mm256_maskz_loadu_ps(tail_mask, &y[tag_m_8x])));
+
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+#endif
\ No newline at end of file

From 3d4ccd2a130447eb7e0b8f5326dcd6e856fb8de9 Mon Sep 17 00:00:00 2001
From: pnp
Date: Fri, 30 Apr 2021 12:25:33 -0400
Subject: [PATCH 2/3] fix for build error

---
 kernel/x86_64/sgemv_n_4.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index 81d495eae..bc006bf3c 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -42,8 +42,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "sgemv_n_microk_skylakex-8.c"
 #endif
 
-#endif
-
 #if defined(STEAMROLLER) || defined(EXCAVATOR)
 #define NBMAX 2048
 #else

From c0ca63ea4672c3b013136ef54a69e5ab967be270 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 5 May 2021 14:55:36 +0200
Subject: [PATCH 3/3] Fix missing conditionals for non-SKX kernels

---
 kernel/x86_64/sgemv_n_4.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index bc006bf3c..06de28d97 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -417,7 +417,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 
 	if ( n2 & 2 )
 	{
+#ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL
 		sgemv_kernel_n_64(NB, 2, alpha, a_ptr, lda, x_ptr, ybuffer);
+#else
+		sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
+#endif
 		a_ptr += lda*2;
 		x_ptr += 2;
 	}
@@ -425,7 +429,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 
 	if ( n2 & 1 )
 	{
+#ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL
 		sgemv_kernel_n_64(NB, 1, alpha, a_ptr, lda, x_ptr, ybuffer);
+#else
+		sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
+#endif
 		/* a_ptr += lda;
 		x_ptr += 1a; */
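
Illustration only, not part of the patch series above: a minimal sketch of a caller that lands in the sgemv "N" (no-transpose, column-major) path these patches optimize. The shape m = 1000, n = 8 sits inside the m <= 16384, n <= 48, n != 4 window that the new HAVE_SGEMV_N_SKYLAKE_KERNEL branch targets on SKYLAKEX/COOPERLAKE builds; the file name and build line are assumptions, and on other targets the same call simply takes the existing kernels.

/* sgemv_small_n.c - hypothetical test driver, assumes an installed OpenBLAS with CBLAS headers */
#include <stdio.h>
#include <stdlib.h>
#include <cblas.h>

int main(void)
{
    const int m = 1000, n = 8;   /* small-n shape covered by the new SKYLAKEX kernels */
    float *A = malloc((size_t)m * n * sizeof(float));
    float *x = malloc((size_t)n * sizeof(float));
    float *y = calloc((size_t)m, sizeof(float));

    /* fill A column-major (lda == m) and x with simple values */
    for (int j = 0; j < n; j++) {
        x[j] = 1.0f;
        for (int i = 0; i < m; i++)
            A[(size_t)j * m + i] = (float)(i + j);
    }

    /* y := 1.0*A*x + 0.0*y -- dispatches to the sgemv_n kernel internally */
    cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, 1.0f, A, m, x, 1, 0.0f, y, 1);

    printf("y[0] = %f, y[m-1] = %f\n", y[0], y[m - 1]);

    free(A); free(x); free(y);
    return 0;
}

Building this with something like "gcc sgemv_small_n.c -o sgemv_small_n -lopenblas" before and after applying the patches gives a quick way to compare results and timings for the small-n shapes that PATCH 1/3 is aimed at.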