From d9ba49165af15d535d9b9955bd248eab4d259f06 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Sun, 27 Sep 2020 10:38:19 +0800 Subject: [PATCH 01/12] Improve the performance of rot by using AVX512 and AVX2 intrinsic --- driver/others/blas_l1_thread.c | 2 +- driver/others/blas_server_win32.c | 11 +- kernel/x86_64/KERNEL.HASWELL | 3 + kernel/x86_64/drot.c | 139 +++++++++++++++++++++++++ kernel/x86_64/drot_microk_haswell-2.c | 87 ++++++++++++++++ kernel/x86_64/drot_microk_skylakex-2.c | 94 +++++++++++++++++ kernel/x86_64/srot.c | 139 +++++++++++++++++++++++++ kernel/x86_64/srot_microk_haswell-2.c | 87 ++++++++++++++++ kernel/x86_64/srot_microk_skylakex-2.c | 91 ++++++++++++++++ 9 files changed, 648 insertions(+), 5 deletions(-) create mode 100644 kernel/x86_64/drot.c create mode 100644 kernel/x86_64/drot_microk_haswell-2.c create mode 100644 kernel/x86_64/drot_microk_skylakex-2.c create mode 100644 kernel/x86_64/srot.c create mode 100644 kernel/x86_64/srot_microk_haswell-2.c create mode 100644 kernel/x86_64/srot_microk_skylakex-2.c diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c index 04acbcc5f..06039c952 100644 --- a/driver/others/blas_l1_thread.c +++ b/driver/others/blas_l1_thread.c @@ -80,7 +80,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha break; } - mode |= BLAS_LEGACY; + if(!(mode & BLAS_PTHREAD)) mode |= BLAS_LEGACY; for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index d2cc91757..f47908c70 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -476,12 +476,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ routine = queue -> routine; - if (!(queue -> mode & BLAS_LEGACY)) { + if (queue -> mode & BLAS_LEGACY) { + legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); + } else + if (queue -> mode & BLAS_PTHREAD) { + void (*pthreadcompat)(void *) = queue -> routine; + (pthreadcompat)(queue -> args); + } else (routine)(queue -> args, queue -> range_m, queue -> range_n, queue -> sa, queue -> sb, 0); - } else { - legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); - } if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index b979fc0ae..81eaf96ac 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -102,3 +102,6 @@ ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c SASUMKERNEL = sasum.c DASUMKERNEL = dasum.c + +SROTKERNEL = srot.c +DROTKERNEL = drot.c diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c new file mode 100644 index 000000000..a312b7ff9 --- /dev/null +++ b/kernel/x86_64/drot.c @@ -0,0 +1,139 @@ +#include "common.h" + +#if defined(SKYLAKEX) +#include "drot_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "drot_microk_haswell-2.c" +#endif + +#ifndef HAVE_DROT_KERNEL + +static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + FLOAT f0, f1, f2, f3; + FLOAT x0, x1, x2, x3; + FLOAT g0, g1, g2, g3; + FLOAT y0, y1, y2, y3; + + FLOAT* xp = x; + FLOAT* yp = y; + + BLASLONG n1 = n & (~7); + + while (i < n1) { + x0 = xp[0]; + y0 = yp[0]; + x1 = xp[1]; + y1 = yp[1]; + x2 = xp[2]; + y2 = yp[2]; + x3 = xp[3]; + y3 = yp[3]; + + f0 = c*x0 + s*y0; + g0 = c*y0 - s*x0; + f1 = c*x1 + s*y1; + g1 = c*y1 - s*x1; + f2 = c*x2 + s*y2; + g2 = c*y2 - s*x2; + f3 = c*x3 + s*y3; + g3 = c*y3 - s*x3; + + xp[0] = f0; + yp[0] = g0; + xp[1] = f1; + yp[1] = g1; + xp[2] = f2; + yp[2] = g2; + xp[3] = f3; + yp[3] = g3; + + xp += 4; + yp += 4; + i += 4; + } + + while (i < n) { + FLOAT temp = c*x[i] + s*y[i]; + y[i] = c*y[i] - s*x[i]; + x[i] = temp; + + i++; + } +} + +#endif +static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + FLOAT temp; + + if (n <= 0) + return; + if ((inc_x == 1) && (inc_y == 1)) { + drot_kernel(n, x, y, c, s); + } + else { + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + + ix += inc_x; + iy += inc_y; + i++; + } + } + return; +} + + +#if defined(SMP) +static int rot_thread_function(blas_arg_t *args) +{ + + rot_compute(args->m, + args->a, args->lda, + args->b, args->ldb, + ((FLOAT *)args->alpha)[0], + ((FLOAT *)args->alpha)[1]); + return 0; +} + +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ +#if defined(SMP) + int nthreads; + FLOAT alpha[2]={c, s}; + FLOAT dummy_c; +#endif + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 100000) { + nthreads = 1; + } + else { + nthreads = num_cpu_avail(1); + } + + if (nthreads == 1) { + rot_compute(n, x, inc_x, y, inc_y, c, s); + } + else { +#if defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD; +#else + int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; +#endif + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + } +#else + rot_compute(n, x, inc_x, y, inc_y, c, s); +#endif + return 0; +} diff --git a/kernel/x86_64/drot_microk_haswell-2.c b/kernel/x86_64/drot_microk_haswell-2.c new file mode 100644 index 000000000..72a87696e --- /dev/null +++ b/kernel/x86_64/drot_microk_haswell-2.c @@ -0,0 +1,87 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_DROT_KERNEL 1 + +#include +#include + +static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + + BLASLONG tail_index_4 = n&(~3); + BLASLONG tail_index_16 = n&(~15); + + __m256d c_256, s_256; + if (n >= 4) { + c_256 = _mm256_set1_pd(c); + s_256 = _mm256_set1_pd(s); + } + + __m256d x0, x1, x2, x3; + __m256d y0, y1, y2, y3; + __m256d t0, t1, t2, t3; + + for (i = 0; i < tail_index_16; i += 16) { + x0 = _mm256_loadu_pd(&x[i + 0]); + x1 = _mm256_loadu_pd(&x[i + 4]); + x2 = _mm256_loadu_pd(&x[i + 8]); + x3 = _mm256_loadu_pd(&x[i +12]); + y0 = _mm256_loadu_pd(&y[i + 0]); + y1 = _mm256_loadu_pd(&y[i + 4]); + y2 = _mm256_loadu_pd(&y[i + 8]); + y3 = _mm256_loadu_pd(&y[i +12]); + + t0 = _mm256_mul_pd(s_256, y0); + t1 = _mm256_mul_pd(s_256, y1); + t2 = _mm256_mul_pd(s_256, y2); + t3 = _mm256_mul_pd(s_256, y3); + + t0 = _mm256_fmadd_pd(c_256, x0, t0); + t1 = _mm256_fmadd_pd(c_256, x1, t1); + t2 = _mm256_fmadd_pd(c_256, x2, t2); + t3 = _mm256_fmadd_pd(c_256, x3, t3); + + _mm256_storeu_pd(&x[i + 0], t0); + _mm256_storeu_pd(&x[i + 4], t1); + _mm256_storeu_pd(&x[i + 8], t2); + _mm256_storeu_pd(&x[i +12], t3); + + t0 = _mm256_mul_pd(s_256, x0); + t1 = _mm256_mul_pd(s_256, x1); + t2 = _mm256_mul_pd(s_256, x2); + t3 = _mm256_mul_pd(s_256, x3); + + t0 = _mm256_fmsub_pd(c_256, y0, t0); + t1 = _mm256_fmsub_pd(c_256, y1, t1); + t2 = _mm256_fmsub_pd(c_256, y2, t2); + t3 = _mm256_fmsub_pd(c_256, y3, t3); + + _mm256_storeu_pd(&y[i + 0], t0); + _mm256_storeu_pd(&y[i + 4], t1); + _mm256_storeu_pd(&y[i + 8], t2); + _mm256_storeu_pd(&y[i +12], t3); + + } + + for (i = tail_index_16; i < tail_index_4; i += 4) { + x0 = _mm256_loadu_pd(&x[i]); + y0 = _mm256_loadu_pd(&y[i]); + + t0 = _mm256_mul_pd(s_256, y0); + t0 = _mm256_fmadd_pd(c_256, x0, t0); + _mm256_storeu_pd(&x[i], t0); + + t0 = _mm256_mul_pd(s_256, x0); + t0 = _mm256_fmsub_pd(c_256, y0, t0); + _mm256_storeu_pd(&y[i], t0); + } + + for (i = tail_index_4; i < n; ++i) { + FLOAT temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } +} +#endif diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c new file mode 100644 index 000000000..4e862e663 --- /dev/null +++ b/kernel/x86_64/drot_microk_skylakex-2.c @@ -0,0 +1,94 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_DROT_KERNEL 1 + +#include +#include + +static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG n1 = n; + + BLASLONG tail_index_8 = 0; + BLASLONG tail_index_32 = 0; + + __m512d c_512 = _mm512_set1_pd(c); + __m512d s_512 = _mm512_set1_pd(s); + + tail_index_8 = n1 & (~7); + tail_index_32 = n1 & (~31); + + + __m512d x0, x1, x2, x3; + __m512d y0, y1, y2, y3; + __m512d t0, t1, t2, t3; + + for (i = 0; i < tail_index_32; i += 32) { + x0 = _mm512_loadu_pd(&x[i + 0]); + x1 = _mm512_loadu_pd(&x[i + 8]); + x2 = _mm512_loadu_pd(&x[i +16]); + x3 = _mm512_loadu_pd(&x[i +24]); + y0 = _mm512_loadu_pd(&y[i + 0]); + y1 = _mm512_loadu_pd(&y[i + 8]); + y2 = _mm512_loadu_pd(&y[i +16]); + y3 = _mm512_loadu_pd(&y[i +24]); + + t0 = _mm512_mul_pd(s_512, y0); + t1 = _mm512_mul_pd(s_512, y1); + t2 = _mm512_mul_pd(s_512, y2); + t3 = _mm512_mul_pd(s_512, y3); + + t0 = _mm512_fmadd_pd(c_512, x0, t0); + t1 = _mm512_fmadd_pd(c_512, x1, t1); + t2 = _mm512_fmadd_pd(c_512, x2, t2); + t3 = _mm512_fmadd_pd(c_512, x3, t3); + + _mm512_storeu_pd(&x[i + 0], t0); + _mm512_storeu_pd(&x[i + 8], t1); + _mm512_storeu_pd(&x[i +16], t2); + _mm512_storeu_pd(&x[i +24], t3); + + t0 = _mm512_mul_pd(s_512, x0); + t1 = _mm512_mul_pd(s_512, x1); + t2 = _mm512_mul_pd(s_512, x2); + t3 = _mm512_mul_pd(s_512, x3); + + t0 = _mm512_fmsub_pd(c_512, y0, t0); + t1 = _mm512_fmsub_pd(c_512, y1, t1); + t2 = _mm512_fmsub_pd(c_512, y2, t2); + t3 = _mm512_fmsub_pd(c_512, y3, t3); + + _mm512_storeu_pd(&y[i + 0], t0); + _mm512_storeu_pd(&y[i + 8], t1); + _mm512_storeu_pd(&y[i +16], t2); + _mm512_storeu_pd(&y[i +24], t3); + } + + for (i = tail_index_32; i < tail_index_8; i += 8) { + x0 = _mm512_loadu_pd(&x[i]); + y0 = _mm512_loadu_pd(&y[i]); + + t0 = _mm512_mul_pd(s_512, y0); + t0 = _mm512_fmadd_pd(c_512, x0, t0); + _mm512_storeu_pd(&x[i], t0); + + t0 = _mm512_mul_pd(s_512, x0); + t0 = _mm512_fmsub_pd(c_512, y0, t0); + _mm512_storeu_pd(&y[i], t0); + } + + if ((n1&7) > 0) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 -(n1&7))); + __m512d tail_x = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x[tail_index_8]); + __m512d tail_y = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &y[tail_index_8]); + __m512d temp = _mm512_mul_pd(s_512, tail_y); + temp = _mm512_fmadd_pd(c_512, tail_x, temp); + _mm512_mask_storeu_pd(&x[tail_index_8],*((__mmask8*)&tail_mask8), temp); + temp = _mm512_mul_pd(s_512, tail_x); + temp = _mm512_fmsub_pd(c_512, tail_y, temp); + _mm512_mask_storeu_pd(&y[tail_index_8], *((__mmask8*)&tail_mask8), temp); + } +} +#endif diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c new file mode 100644 index 000000000..021c20d82 --- /dev/null +++ b/kernel/x86_64/srot.c @@ -0,0 +1,139 @@ +#include "common.h" + +#if defined(SKYLAKEX) +#include "srot_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "srot_microk_haswell-2.c" +#endif + +#ifndef HAVE_SROT_KERNEL + +static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + FLOAT f0, f1, f2, f3; + FLOAT x0, x1, x2, x3; + FLOAT g0, g1, g2, g3; + FLOAT y0, y1, y2, y3; + + FLOAT* xp = x; + FLOAT* yp = y; + + BLASLONG n1 = n & (~7); + + while (i < n1) { + x0 = xp[0]; + y0 = yp[0]; + x1 = xp[1]; + y1 = yp[1]; + x2 = xp[2]; + y2 = yp[2]; + x3 = xp[3]; + y3 = yp[3]; + + f0 = c*x0 + s*y0; + g0 = c*y0 - s*x0; + f1 = c*x1 + s*y1; + g1 = c*y1 - s*x1; + f2 = c*x2 + s*y2; + g2 = c*y2 - s*x2; + f3 = c*x3 + s*y3; + g3 = c*y3 - s*x3; + + xp[0] = f0; + yp[0] = g0; + xp[1] = f1; + yp[1] = g1; + xp[2] = f2; + yp[2] = g2; + xp[3] = f3; + yp[3] = g3; + + xp += 4; + yp += 4; + i += 4; + } + + while (i < n) { + FLOAT temp = c*x[i] + s*y[i]; + y[i] = c*y[i] - s*x[i]; + x[i] = temp; + + i++; + } +} + +#endif +static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + FLOAT temp; + + if (n <= 0) + return; + if ((inc_x == 1) && (inc_y == 1)) { + srot_kernel(n, x, y, c, s); + } + else { + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + + ix += inc_x; + iy += inc_y; + i++; + } + } + return; +} + + +#if defined(SMP) +static int rot_thread_function(blas_arg_t *args) +{ + + rot_compute(args->m, + args->a, args->lda, + args->b, args->ldb, + ((float *)args->alpha)[0], + ((float *)args->alpha)[1]); + return 0; +} + +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ +#if defined(SMP) + int nthreads; + FLOAT alpha[2]={c, s}; + FLOAT dummy_c; +#endif + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 100000) { + nthreads = 1; + } + else { + nthreads = num_cpu_avail(1); + } + + if (nthreads == 1) { + rot_compute(n, x, inc_x, y, inc_y, c, s); + } + else { +#if defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD; +#else + int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; +#endif + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + } +#else + rot_compute(n, x, inc_x, y, inc_y, c, s); +#endif + return 0; +} diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c new file mode 100644 index 000000000..cba962042 --- /dev/null +++ b/kernel/x86_64/srot_microk_haswell-2.c @@ -0,0 +1,87 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SROT_KERNEL 1 + +#include +#include + +static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + + BLASLONG tail_index_8 = n&(~7); + BLASLONG tail_index_32 = n&(~31); + + __m256 c_256, s_256; + if (n >= 8) { + c_256 = _mm256_set1_ps(c); + s_256 = _mm256_set1_ps(s); + } + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + __m256 t0, t1, t2, t3; + + for (i = 0; i < tail_index_32; i += 32) { + x0 = _mm256_loadu_ps(&x[i + 0]); + x1 = _mm256_loadu_ps(&x[i + 8]); + x2 = _mm256_loadu_ps(&x[i +16]); + x3 = _mm256_loadu_ps(&x[i +24]); + y0 = _mm256_loadu_ps(&y[i + 0]); + y1 = _mm256_loadu_ps(&y[i + 8]); + y2 = _mm256_loadu_ps(&y[i +16]); + y3 = _mm256_loadu_ps(&y[i +24]); + + t0 = _mm256_mul_ps(s_256, y0); + t1 = _mm256_mul_ps(s_256, y1); + t2 = _mm256_mul_ps(s_256, y2); + t3 = _mm256_mul_ps(s_256, y3); + + t0 = _mm256_fmadd_ps(c_256, x0, t0); + t1 = _mm256_fmadd_ps(c_256, x1, t1); + t2 = _mm256_fmadd_ps(c_256, x2, t2); + t3 = _mm256_fmadd_ps(c_256, x3, t3); + + _mm256_storeu_ps(&x[i + 0], t0); + _mm256_storeu_ps(&x[i + 8], t1); + _mm256_storeu_ps(&x[i +16], t2); + _mm256_storeu_ps(&x[i +24], t3); + + t0 = _mm256_mul_ps(s_256, x0); + t1 = _mm256_mul_ps(s_256, x1); + t2 = _mm256_mul_ps(s_256, x2); + t3 = _mm256_mul_ps(s_256, x3); + + t0 = _mm256_fmsub_ps(c_256, y0, t0); + t1 = _mm256_fmsub_ps(c_256, y1, t1); + t2 = _mm256_fmsub_ps(c_256, y2, t2); + t3 = _mm256_fmsub_ps(c_256, y3, t3); + + _mm256_storeu_ps(&y[i + 0], t0); + _mm256_storeu_ps(&y[i + 8], t1); + _mm256_storeu_ps(&y[i +16], t2); + _mm256_storeu_ps(&y[i +24], t3); + + } + + for (i = tail_index_32; i < tail_index_8; i += 8) { + x0 = _mm256_loadu_ps(&x[i]); + y0 = _mm256_loadu_ps(&y[i]); + + t0 = _mm256_mul_ps(s_256, y0); + t0 = _mm256_fmadd_ps(c_256, s0, t0); + _mm256_storeu_ps(&x[i], t0); + + t0 = _mm256_mul_ps(s_256, x0); + t0 = _mm256_fmsub_ps(c_256, y0, t0); + _mm256_storeu_ps(&y[i], t0); + } + + for (i = tail_index_8; i < n; ++i) { + FLOAT temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } +} +#endif diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c new file mode 100644 index 000000000..a21d1cf64 --- /dev/null +++ b/kernel/x86_64/srot_microk_skylakex-2.c @@ -0,0 +1,91 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SROT_KERNEL 1 + +#include +#include + +static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + __m512 c_512, s_512; + c_512 = _mm512_set1_ps(c); + s_512 = _mm512_set1_ps(s); + + BLASLONG tail_index_16 = n&(~15); + BLASLONG tail_index_64 = n&(~63); + + + __m512 x0, x1, x2, x3; + __m512 y0, y1, y2, y3; + __m512 t0, t1, t2, t3; + + for (i = 0; i < tail_index_64; i += 64) { + x0 = _mm512_loadu_ps(&x[i + 0]); + x1 = _mm512_loadu_ps(&x[i +16]); + x2 = _mm512_loadu_ps(&x[i +32]); + x3 = _mm512_loadu_ps(&x[i +48]); + y0 = _mm512_loadu_ps(&y[i + 0]); + y1 = _mm512_loadu_ps(&y[i +16]); + y2 = _mm512_loadu_ps(&y[i +32]); + y3 = _mm512_loadu_ps(&y[i +48]); + + t0 = _mm512_mul_ps(s_512, y0); + t1 = _mm512_mul_ps(s_512, y1); + t2 = _mm512_mul_ps(s_512, y2); + t3 = _mm512_mul_ps(s_512, y3); + + t0 = _mm512_fmadd_ps(c_512, x0, t0); + t1 = _mm512_fmadd_ps(c_512, x1, t1); + t2 = _mm512_fmadd_ps(c_512, x2, t2); + t3 = _mm512_fmadd_ps(c_512, x3, t3); + + _mm512_storeu_ps(&x[i + 0], t0); + _mm512_storeu_ps(&x[i +16], t1); + _mm512_storeu_ps(&x[i +32], t2); + _mm512_storeu_ps(&x[i +48], t3); + + t0 = _mm512_mul_ps(s_512, x0); + t1 = _mm512_mul_ps(s_512, x1); + t2 = _mm512_mul_ps(s_512, x2); + t3 = _mm512_mul_ps(s_512, x3); + + t0 = _mm512_fmsub_ps(c_512, y0, t0); + t1 = _mm512_fmsub_ps(c_512, y1, t1); + t2 = _mm512_fmsub_ps(c_512, y2, t2); + t3 = _mm512_fmsub_ps(c_512, y3, t3); + + _mm512_storeu_ps(&y[i + 0], t0); + _mm512_storeu_ps(&y[i +16], t1); + _mm512_storeu_ps(&y[i +32], t2); + _mm512_storeu_ps(&y[i +48], t3); + } + + for (i = tail_index_64; i < tail_index_16; i += 16) { + x0 = _mm512_loadu_ps(&x[i]); + y0 = _mm512_loadu_ps(&y[i]); + + t0 = _mm512_mul_ps(s_512, y0); + t0 = _mm512_fmadd_ps(c_512, x0, t0); + _mm512_storeu_ps(&x[i], t0); + + t0 = _mm512_mul_ps(s_512, x0); + t0 = _mm512_fmsub_ps(c_512, y0, t0); + _mm512_storeu_ps(&y[i], t0); + } + + + if ((n & 15) > 0) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16-(n&15))); + __m512 tail_x = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &x[tail_index_16]); + __m512 tail_y = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &y[tail_index_16]); + __m512 temp = _mm512_mul_ps(s_512, tail_y); + temp = _mm512_fmadd_ps(c_512, tail_x, temp); + _mm512_mask_storeu_ps(&x[tail_index_16], *((__mmask16*)&tail_mask16), temp); + temp = _mm512_mul_ps(s_512, tail_x); + temp = _mm512_fmsub_ps(c_512, tail_y, temp); + _mm512_mask_storeu_ps(&y[tail_index_16], *((__mmask16*)&tail_mask16), temp); + } +} +#endif From 725ffbf041b021d2f3602b2313e4027aab19ee89 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Thu, 5 Nov 2020 16:25:17 +0800 Subject: [PATCH 02/12] fix typo --- kernel/x86_64/srot_microk_haswell-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c index cba962042..8e245cc8f 100644 --- a/kernel/x86_64/srot_microk_haswell-2.c +++ b/kernel/x86_64/srot_microk_haswell-2.c @@ -70,7 +70,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) y0 = _mm256_loadu_ps(&y[i]); t0 = _mm256_mul_ps(s_256, y0); - t0 = _mm256_fmadd_ps(c_256, s0, t0); + t0 = _mm256_fmadd_ps(c_256, x0, t0); _mm256_storeu_ps(&x[i], t0); t0 = _mm256_mul_ps(s_256, x0); From 438a8e5624ef1adfe98f989655ca398866143458 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:26:12 +0100 Subject: [PATCH 03/12] Fix placement of getarch call and spurious cpu property accumulation in DYNAMIC_ARCH builds --- cmake/prebuild.cmake | 45 ++++++---------- cmake/system.cmake | 124 ++++++++++++++++++++----------------------- 2 files changed, 73 insertions(+), 96 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index b1b4c501a..da7686c33 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -139,36 +139,6 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM3M_UNROLL_N 4) set(ZGEMM3M_UNROLL_M 4) set(ZGEMM3M_UNROLL_N 4) - elseif ("${TCORE}" STREQUAL "BARCELONA") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "STEAMROLLER") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "EXCAVATOR") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "NEHALEM") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "PRESCOTT") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "SANDYBRIDGE") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX\n") - elseif ("${TCORE}" STREQUAL "HASWELL") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX2\n") - elseif ("${TCORE}" STREQUAL "ZEN") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX2\n") - elseif ("${TCORE}" STREQUAL "SKYLAKEX") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX512\n") - elseif ("${TCORE}" STREQUAL "COOPERLAKE") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX512\n") elseif ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" @@ -586,6 +556,21 @@ else(NOT CMAKE_CROSSCOMPILING) MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") endif () endif () + unset (HAVE_AVX2) + unset (HAVE_AVX) + unset (HAVE_FMA3) + unset (HAVE_MMX) + unset (HAVE_SSE) + unset (HAVE_SSE2) + unset (HAVE_SSE3) + unset (HAVE_SSSE3) + unset (HAVE_SSE4A) + unset (HAVE_SSE4_1) + unset (HAVE_SSE4_2) + unset (HAVE_NEON) + unset (HAVE_VFP) + unset (HAVE_VFPV3) + unset (HAVE_VFPV4) message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way diff --git a/cmake/system.cmake b/cmake/system.cmake index 48d206b12..66e95c6d3 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -44,74 +44,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () endif () -if (DEFINED TARGET) - if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) -# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") - else() - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") - endif() -# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") -# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") -# endif() - endif() - if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") - endif() - if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) - if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") - endif() - elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse -msse3 -mavx2") - endif() - endif() - if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") - endif() - if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") - endif() - if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx") - endif() - if (${TARGET} STREQUAL "BARCELONA" OR ${TARGET} STREQUAL "STEAMROLLER" OR ${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "EXCAVATOR") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "BOBCAT" OR ${TARGET} STREQUAL "OPTERON_SSE3") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "PRESCOTT" OR ${TARGET} STREQUAL "NANO") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "NEHALEM" OR ${TARGET} STREQUAL "ATOM") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "CORE2" OR ${TARGET} STREQUAL "PENRYN" OR ${TARGET} STREQUAL "DUNNINGTON") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (DEFINED HAVE_SSE) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") - endif() - if (DEFINED HAVE_SSE2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") - endif() - if (DEFINED HAVE_SSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (DEFINED HAVE_SSSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") - endif() - if (DEFINED HAVE_SSE4_1) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") - endif() -endif() if (DEFINED TARGET) + message(STATUS "-- -- -- -- -- -- -- -- -- -- -- -- --") message(STATUS "Targeting the ${TARGET} architecture.") set(GETARCH_FLAGS "-DFORCE_${TARGET}") endif () @@ -211,6 +146,63 @@ else() endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") +if (DEFINED TARGET) + if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) +# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() +# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") +# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") +# endif() + endif() + if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() + if (DEFINED HAVE_AVX) + if (NOT NO_AVX) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx") + endif() + endif() + if (DEFINED HAVE_AVX2) + if (NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() + if (DEFINED HAVE_FMA3) + if (NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") + endif() + endif() + if (DEFINED HAVE_SSE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") + endif() + if (DEFINED HAVE_SSE2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") + endif() + if (DEFINED HAVE_SSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (DEFINED HAVE_SSSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") + endif() + if (DEFINED HAVE_SSE4_1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") + endif() +endif() if (DEFINED BINARY) message(STATUS "Compiling a ${BINARY}-bit binary.") endif () From a29338aaa6b364ce99ea30785d1227bd327ce3c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:27:42 +0100 Subject: [PATCH 04/12] Remove extraneous quotes that caused a cmake policy warning --- cmake/cc.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 2f4d1c6d7..b963940d6 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -96,7 +96,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN") endif () endif () -if (${CORE} STREQUAL "SKYLAKEX") +if (${CORE} STREQUAL SKYLAKEX) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") @@ -104,7 +104,7 @@ if (${CORE} STREQUAL "SKYLAKEX") endif () endif () -if (${CORE} STREQUAL "COOPERLAKE") +if (${CORE} STREQUAL COOPERLAKE) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) From ccb9731c7b41b601412b00b73f6da98613d66b7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:30:15 +0100 Subject: [PATCH 05/12] Fix propagation of cpu properties to compiler options --- Makefile.x86_64 | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 49a9a0a23..43bfc9ecd 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -9,9 +9,9 @@ endif endif ifdef HAVE_SSE3 -ifndef DYNAMIC_ARCH CCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3 +endif ifdef HAVE_SSSE3 CCOMMON_OPT += -mssse3 FCOMMON_OPT += -mssse3 @@ -20,7 +20,17 @@ ifdef HAVE_SSE4_1 CCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1 endif +ifdef HAVE_AVX +CCOMMON_OPT += -mavx +FCOMMON_OPT += -mavx endif +ifdef HAVE_AVX2 +CCOMMON_OPT += -mavx2 +FCOMMON_OPT += -mavx2 +endif +ifdef HAVE_FMA3 +CCOMMON_OPT += -mfma +FCOMMON_OPT += -mfma endif ifeq ($(CORE), SKYLAKEX) @@ -66,8 +76,7 @@ endif endif endif -ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE)) -ifndef DYNAMIC_ARCH +ifdef HAVE_AVX2 ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 @@ -96,7 +105,6 @@ endif endif endif endif -endif From a04f532edfe65a7e4cf4dfb2dc34d363e2eba065 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:37:03 +0100 Subject: [PATCH 06/12] Reset cpu property flags between build cycles in DYNAMIC_ARCH mode --- Makefile.system | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Makefile.system b/Makefile.system index ca302a98a..dc7ed3f3a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -252,6 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf else +undefine HAVE_NEON +undefine HAVE_VFP +undefine HAVE_VFPV3 +undefine HAVE_VFPV4 +undefine HAVE_MMX +undefine HAVE_SSE +undefine HAVE_SSE2 +undefine HAVE_SSE3 +undefine HAVE_SSSE3 +undefine HAVE_SSE4_1 +undefine HAVE_SSE4_2 +undefine HAVE_SSE4A +undefine HAVE_SSE5 +undefine HAVE_AVX +undefine HAVE_AVX2 +undefine HAVE_FMA3 include $(TOPDIR)/Makefile_kernel.conf endif @@ -1522,6 +1538,8 @@ export HAVE_SSE4_2 export HAVE_SSE4A export HAVE_SSE5 export HAVE_AVX +export HAVE_AVX2 +export HAVE_FMA3 export HAVE_VFP export HAVE_VFPV3 export HAVE_VFPV4 From b976a0bf4095fd8b9e80ae3cf0e0f6eab200219e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:39:56 +0100 Subject: [PATCH 07/12] Remove previous workaround for compiler flags related to cpu capabilities in x86_64 DYNAMIC_ARCH builds --- kernel/Makefile | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index e811ed43d..fb1d5d39a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,13 +5,6 @@ endif TOPDIR = .. include $(TOPDIR)/Makefile.system -ifdef HAVE_SSE3 -CFLAGS += -msse3 -endif -ifdef HAVE_SSSE3 -CFLAGS += -mssse3 -endif - ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as @@ -38,12 +31,6 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) - override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1 -endif - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON)) - override CFLAGS += -msse -msse2 -endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq ($(GCCVERSIONGTEQ10), 1) From 6e364981a8af0f72ad9e62a69fe62fdedc18255b Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sat, 7 Nov 2020 15:21:58 -0600 Subject: [PATCH 08/12] Optimize sdot/ddot for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. --- kernel/power/KERNEL.POWER10 | 6 +- kernel/power/ddot_microk_power10.c | 131 ++++++++++++++++++++++++ kernel/power/ddot_power10.c | 130 ++++++++++++++++++++++++ kernel/power/sdot_microk_power10.c | 135 +++++++++++++++++++++++++ kernel/power/sdot_power10.c | 154 +++++++++++++++++++++++++++++ 5 files changed, 553 insertions(+), 3 deletions(-) create mode 100644 kernel/power/ddot_microk_power10.c create mode 100644 kernel/power/ddot_power10.c create mode 100644 kernel/power/sdot_microk_power10.c create mode 100644 kernel/power/sdot_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 28c39051f..c25cd9f04 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -151,9 +151,9 @@ DCOPYKERNEL = dcopy_power10.c CCOPYKERNEL = ccopy_power10.c ZCOPYKERNEL = zcopy_power10.c # -SDOTKERNEL = sdot.c -DDOTKERNEL = ddot.c -DSDOTKERNEL = sdot.c +SDOTKERNEL = sdot_power10.c +DDOTKERNEL = ddot_power10.c +DSDOTKERNEL = sdot_power10.c ifneq ($(GCCVERSIONGTEQ9),1) CDOTKERNEL = cdot_power9.S else diff --git a/kernel/power/ddot_microk_power10.c b/kernel/power/ddot_microk_power10.c new file mode 100644 index 000000000..3a9865cc0 --- /dev/null +++ b/kernel/power/ddot_microk_power10.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static double ddot_kernel_8 (long n, double *x, double *y) +{ + double dot; + + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 50, 32(%3) \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 52, 64(%3) \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + XXSWAPD_S(33,32) + + "xsadddp %x0, 32, 33 \n" + + "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" + : + "=d" (dot), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" + ); + + return dot; +} diff --git a/kernel/power/ddot_power10.c b/kernel/power/ddot_power10.c new file mode 100644 index 000000000..302dceb68 --- /dev/null +++ b/kernel/power/ddot_power10.c @@ -0,0 +1,130 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "ddot_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + return dot; +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + dot = ddot_kernel_8(n1, x, y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + + while(i < n) + { + + temp1 += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + dot = temp1 + temp2; + return(dot); + +} + + diff --git a/kernel/power/sdot_microk_power10.c b/kernel/power/sdot_microk_power10.c new file mode 100644 index 000000000..2f028c5a0 --- /dev/null +++ b/kernel/power/sdot_microk_power10.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static float sdot_kernel_16 (long n, float *x, float *y) +{ + float dot; + + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 50, 32(%3) \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 52, 64(%3) \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + "xxsldwi 33, 32, 32, 2 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xxsldwi 33, 32, 32, 1 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xscvspdp %x0, 32 \n" + + "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" + : + "=f" (dot), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" + ); + + return dot; +} diff --git a/kernel/power/sdot_power10.c b/kernel/power/sdot_power10.c new file mode 100644 index 000000000..b61f0a90d --- /dev/null +++ b/kernel/power/sdot_power10.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "sdot_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + return dot; +} + +#endif + +#if defined (DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + +#if defined (DSDOT) + double mydot = 0.0; + FLOAT asmdot = 0.0; +#else + FLOAT mydot=0.0; +#endif + BLASLONG n1; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + n1 = n & (BLASLONG)(-32); + + if ( n1 ) +#if defined(DSDOT) + { + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG n2 = 32; + while (i Date: Sat, 7 Nov 2020 23:37:21 +0100 Subject: [PATCH 09/12] Update Makefile.system --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index dc7ed3f3a..258a84262 100644 --- a/Makefile.system +++ b/Makefile.system @@ -252,7 +252,9 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf else +ifdef HAVE_NEON undefine HAVE_NEON +endif undefine HAVE_VFP undefine HAVE_VFPV3 undefine HAVE_VFPV4 From f6a57d8f63ed0f1fa4823d27daafc2cb3a6dc96b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Nov 2020 00:01:36 +0100 Subject: [PATCH 10/12] Update Makefile.system --- Makefile.system | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile.system b/Makefile.system index 258a84262..da2d452b2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -255,9 +255,15 @@ else ifdef HAVE_NEON undefine HAVE_NEON endif +ifdef HAVE_VFP undefine HAVE_VFP +endif +ifdef HAVE_VFPV3 undefine HAVE_VFPV3 +endif +ifdef HAVE_VFPV4 undefine HAVE_VFPV4 +endif undefine HAVE_MMX undefine HAVE_SSE undefine HAVE_SSE2 From 1c4cfdc13937765dd9bd0ef8b846ba027ec086b3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Nov 2020 00:12:55 +0100 Subject: [PATCH 11/12] Stay compatible with old gmake that did not support undefine --- Makefile.system | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/Makefile.system b/Makefile.system index da2d452b2..aae7ba503 100644 --- a/Makefile.system +++ b/Makefile.system @@ -6,7 +6,7 @@ INCLUDED = 1 ifndef TOPDIR -TOPDIR = . +TOPDIR = . endif # If ARCH is not set, we use the host system's architecture for getarch compile options. @@ -252,30 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf else -ifdef HAVE_NEON -undefine HAVE_NEON -endif -ifdef HAVE_VFP -undefine HAVE_VFP -endif -ifdef HAVE_VFPV3 -undefine HAVE_VFPV3 -endif -ifdef HAVE_VFPV4 -undefine HAVE_VFPV4 -endif -undefine HAVE_MMX -undefine HAVE_SSE -undefine HAVE_SSE2 -undefine HAVE_SSE3 -undefine HAVE_SSSE3 -undefine HAVE_SSE4_1 -undefine HAVE_SSE4_2 -undefine HAVE_SSE4A -undefine HAVE_SSE5 -undefine HAVE_AVX -undefine HAVE_AVX2 -undefine HAVE_FMA3 +HAVE_NEON= +HAVE_VFP= +HAVE_VFPV3= +HAVE_VFPV4= +HAVE_MMX= +HAVE_SSE= +HAVE_SSE2= +HAVE_SSE3= +HAVE_SSSE3= +HAVE_SSE4_1= +HAVE_SSE4_2= +HAVE_SSE4A= +HAVE_SSE5= +HAVE_AVX= +HAVE_AVX2= +HAVE_FMA3= include $(TOPDIR)/Makefile_kernel.conf endif From ec088bf33aa3034a82b713ea304fe30e36c278ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Nov 2020 13:15:40 +0100 Subject: [PATCH 12/12] Fix missing AVX2 and FMA3 capabilities in FORCE_target mode --- getarch.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/getarch.c b/getarch.c index ab90f36d9..daf669e56 100644 --- a/getarch.c +++ b/getarch.c @@ -330,7 +330,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #endif @@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #else @@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" #define LIBNAME "skylakex" #define CORENAME "SKYLAKEX" #endif @@ -376,7 +376,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #else @@ -389,7 +389,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" #define LIBNAME "cooperlake" #define CORENAME "COOPERLAKE" #endif @@ -559,7 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA3 -DFMA3" + "-DHAVE_AVX -DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "zen" #define CORENAME "ZEN" #endif