From 5ceca1a4d872d98d141c1f1b3512c82f37057b6a Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Tue, 6 Dec 2022 14:05:49 -0500 Subject: [PATCH] Add sscal.c + microkernels for Haswell, Zen, Skylake and newer. Unlike [dcz]scal, sscal still used the original GotoBLAS SSE code from scal_sse.S. This code follows dscal as closely as possible, except for the inc_x > 1 code for which a plain C loop is used much like the one in cscal.c, instead of an adaptation of the SSE2 asm code of dscal.c (I tried but the performance wasn't better than the plain C loop). --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/KERNEL.ZEN | 1 + kernel/x86_64/sscal.c | 196 ++++++++++++++++++++++++ kernel/x86_64/sscal_microk_haswell-2.c | 180 ++++++++++++++++++++++ kernel/x86_64/sscal_microk_skylakex-2.c | 86 +++++++++++ 5 files changed, 464 insertions(+) create mode 100644 kernel/x86_64/sscal.c create mode 100644 kernel/x86_64/sscal_microk_haswell-2.c create mode 100644 kernel/x86_64/sscal_microk_skylakex-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 81eaf96ac..aaf686c9f 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,3 +1,4 @@ +SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index a66394be3..9978202a7 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -1,3 +1,4 @@ +SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c diff --git a/kernel/x86_64/sscal.c b/kernel/x86_64/sscal.c new file mode 100644 index 000000000..af1220f1b --- /dev/null +++ b/kernel/x86_64/sscal.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2013 - 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(HASWELL) || defined(ZEN) +#include "sscal_microk_haswell-2.c" +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "sscal_microk_skylakex-2.c" +#endif + + +#if !defined(HAVE_KERNEL_16) + +static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x ) +{ + + BLASLONG i; + FLOAT alpha = *da; + + for( i=0; i 0 ) + { + sscal_kernel_inc_8(n1, &da, x, inc_x); + i = n1 * inc_x; + j = n1; + } + + while(j < n) + { + + x[i] *= da; + i += inc_x ; + j++; + + } + + } + + return(0); + } + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + if ( da == 0.0 ) + sscal_kernel_16_zero(n1 , &da , x); + else + sscal_kernel_16(n1 , &da , x); + } + + if ( da == 0.0 ) + { + for ( i=n1 ; i> 5 ; + BLASLONG n2 = n & 16 ; + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%ymm0 \n\t" // alpha + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 4f \n\t" + + "vmulps -128(%1), %%ymm0, %%ymm4 \n\t" + "vmulps -96(%1), %%ymm0, %%ymm5 \n\t" + + "vmulps -64(%1), %%ymm0, %%ymm6 \n\t" + "vmulps -32(%1), %%ymm0, %%ymm7 \n\t" + + "subq $1 , %0 \n\t" + "jz 2f \n\t" + + ".p2align 4 \n\t" + "1: \n\t" + // "prefetcht0 640(%1) \n\t" + + "vmovups %%ymm4 ,-128(%1) \n\t" + "vmovups %%ymm5 , -96(%1) \n\t" + "vmulps 0(%1), %%ymm0, %%ymm4 \n\t" + + // "prefetcht0 704(%1) \n\t" + + "vmovups %%ymm6 , -64(%1) \n\t" + "vmulps 32(%1), %%ymm0, %%ymm5 \n\t" + "vmovups %%ymm7 , -32(%1) \n\t" + + "vmulps 64(%1), %%ymm0, %%ymm6 \n\t" + "vmulps 96(%1), %%ymm0, %%ymm7 \n\t" + + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmovups %%ymm4 ,-128(%1) \n\t" + "vmovups %%ymm5 , -96(%1) \n\t" + + "vmovups %%ymm6 , -64(%1) \n\t" + "vmovups %%ymm7 , -32(%1) \n\t" + + "addq $128, %1 \n\t" + + "4: \n\t" + + "cmpq $16 ,%3 \n\t" + "jne 5f \n\t" + + "vmulps -128(%1), %%ymm0, %%ymm4 \n\t" + "vmulps -96(%1), %%ymm0, %%ymm5 \n\t" + + "vmovups %%ymm4 ,-128(%1) \n\t" + "vmovups %%ymm5 , -96(%1) \n\t" + + "5: \n\t" + + "vzeroupper \n\t" + + : + "+r" (n1), // 0 + "+r" (x) // 1 + : + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%ymm14", "%xmm15", + "memory" + ); + +} + + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 5 ; + BLASLONG n2 = n & 16 ; + + __asm__ __volatile__ + ( + "vxorpd %%ymm0, %%ymm0 , %%ymm0 \n\t" + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 2f \n\t" + + ".p2align 4 \n\t" + "1: \n\t" + + "vmovups %%ymm0 ,-128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" + + "vmovups %%ymm0 , -64(%1) \n\t" + "vmovups %%ymm0 , -32(%1) \n\t" + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $16 ,%3 \n\t" + "jne 4f \n\t" + + "vmovups %%ymm0 ,-128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" + + "4: \n\t" + + "vzeroupper \n\t" + + : + "+r" (n1), // 0 + "+r" (x) // 1 + : + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/sscal_microk_skylakex-2.c b/kernel/x86_64/sscal_microk_skylakex-2.c new file mode 100644 index 000000000..31790000d --- /dev/null +++ b/kernel/x86_64/sscal_microk_skylakex-2.c @@ -0,0 +1,86 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_16 1 + +static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + int i = 0; + +#ifdef __AVX512CD__ + __m512 __alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha)); + BLASLONG nn = n & -32; + for (; i < nn; i += 32) { + __m512 a = _mm512_loadu_ps(&x[i + 0]); + __m512 b = _mm512_loadu_ps(&x[i + 16]); + a *= __alpha5; + b *= __alpha5; + _mm512_storeu_ps(&x[i + 0], a); + _mm512_storeu_ps(&x[i + 16], b); + } + for (; i < n; i += 16) { + _mm512_storeu_ps(&x[i + 0], __alpha5 * _mm512_loadu_ps(&x[i + 0])); + } +#else + __m256 __alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha)); + for (; i < n; i += 16) { + _mm256_storeu_ps(&x[i + 0], __alpha * _mm256_loadu_ps(&x[i + 0])); + _mm256_storeu_ps(&x[i + 8], __alpha * _mm256_loadu_ps(&x[i + 8])); + } +#endif +} + + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + int i = 0; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512 zero = _mm512_setzero_ps(); + for (; i < n; i += 16) { + _mm512_storeu_ps(&x[i], zero); + } +#else + __m256 zero = _mm256_setzero_ps(); + for (; i < n; i += 16) { + _mm256_storeu_ps(&x[i + 0], zero); + _mm256_storeu_ps(&x[i + 8], zero); + } +#endif + +} + +#else +#include "dscal_microk_haswell-2.c" +#endif