From 93aa18b1a806d7f2749aace0b7c88831a4c576cc Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sun, 5 Aug 2018 18:29:34 +0000
Subject: [PATCH] daxpy_haswell: Change to C+intrinsics + AVX512 to mimic the change to saxpy_haswell

Use the same transformation for daxpy as was done for saxpy;
this gives a low double-digit performance increase.
---
 kernel/x86_64/daxpy_microk_haswell-2.c | 85 ++++++++++++--------------
 1 file changed, 40 insertions(+), 45 deletions(-)

diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c
index bbe8b9550..c2491ba9b 100644
--- a/kernel/x86_64/daxpy_microk_haswell-2.c
+++ b/kernel/x86_64/daxpy_microk_haswell-2.c
@@ -25,54 +25,49 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
+
+#ifndef __AVX512CD__
+#pragma GCC target("avx2,fma")
+#endif
+
+#ifdef __AVX2__
+
+#include <immintrin.h>
+
 #define HAVE_KERNEL_8 1
-static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
 
 static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 {
-
-
-        BLASLONG register i = 0;
-
-        __asm__ __volatile__
-        (
-        "vbroadcastsd     (%4), %%ymm0                \n\t"  // alpha
-
-        ".p2align 4                                   \n\t"
-        "1:                                           \n\t"
-
-        "vmovups          (%3,%0,8), %%ymm12          \n\t"  // 4 * y
-        "vmovups        32(%3,%0,8), %%ymm13          \n\t"  // 4 * y
-        "vmovups        64(%3,%0,8), %%ymm14          \n\t"  // 4 * y
-        "vmovups        96(%3,%0,8), %%ymm15          \n\t"  // 4 * y
-        "vfmadd231pd      (%2,%0,8), %%ymm0 , %%ymm12 \n\t"  // y += alpha * x
-        "vfmadd231pd    32(%2,%0,8), %%ymm0 , %%ymm13 \n\t"  // y += alpha * x
-        "vfmadd231pd    64(%2,%0,8), %%ymm0 , %%ymm14 \n\t"  // y += alpha * x
-        "vfmadd231pd    96(%2,%0,8), %%ymm0 , %%ymm15 \n\t"  // y += alpha * x
-        "vmovups        %%ymm12,   (%3,%0,8)          \n\t"
-        "vmovups        %%ymm13, 32(%3,%0,8)          \n\t"
-        "vmovups        %%ymm14, 64(%3,%0,8)          \n\t"
-        "vmovups        %%ymm15, 96(%3,%0,8)          \n\t"
-
-        "addq           $16, %0                       \n\t"
-        "subq           $16, %1                       \n\t"
-        "jnz            1b                            \n\t"
-        "vzeroupper                                   \n\t"
-
-        :
-        :
-          "r" (i),      // 0
-          "r" (n),      // 1
-          "r" (x),      // 2
-          "r" (y),      // 3
-          "r" (alpha)   // 4
-        : "cc",
-          "%xmm0",
-          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
-          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
-          "memory"
-        );
-
-}
+        BLASLONG i = 0;
+
+        __m256d __alpha;
+
+        __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
+
+#ifdef __AVX512CD__
+        BLASLONG n32;
+        __m512d __alpha5;
+        __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
+
+        n32 = n & ~31;
+
+        for (; i < n32; i+= 32) {
+                _mm512_storeu_pd(&y[i +  0], _mm512_loadu_pd(&y[i +  0]) + __alpha5 * _mm512_loadu_pd(&x[i +  0]));
+                _mm512_storeu_pd(&y[i +  8], _mm512_loadu_pd(&y[i +  8]) + __alpha5 * _mm512_loadu_pd(&x[i +  8]));
+                _mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16]));
+                _mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24]));
+        }
+
+#endif
+
+        for (; i < n; i+= 16) {
+                _mm256_storeu_pd(&y[i +  0], _mm256_loadu_pd(&y[i +  0]) + __alpha * _mm256_loadu_pd(&x[i +  0]));
+                _mm256_storeu_pd(&y[i +  4], _mm256_loadu_pd(&y[i +  4]) + __alpha * _mm256_loadu_pd(&x[i +  4]));
+                _mm256_storeu_pd(&y[i +  8], _mm256_loadu_pd(&y[i +  8]) + __alpha * _mm256_loadu_pd(&x[i +  8]));
+                _mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12]));
+        }
+}
+#endif
 
 
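
Note (not part of the patch): the new kernel simply computes y[i] += alpha * x[i], 16 doubles
per iteration in the AVX2 loop and 32 per iteration in the AVX512 loop, using the GCC/Clang
vector operators on __m256d/__m512d; with the fma target enabled the compiler is expected to
fuse the multiply and add into vfmadd instructions, which the removed hand-written assembly
did explicitly. The standalone sketch below illustrates the AVX2 shape of the kernel and checks
the result. The file name, the FLOAT/BLASLONG typedefs, the main() harness and the build line
are assumptions for illustration only (in OpenBLAS they come from the build system and the
daxpy driver), and n is assumed to be a multiple of 16 as the kernel requires. Build with,
e.g.: gcc -O2 -mavx2 -mfma -o daxpy_sketch daxpy_sketch.c

/* daxpy_sketch.c -- standalone illustration only, not part of the patch.
 * Assumes FLOAT = double and BLASLONG = long, as in an x86_64 OpenBLAS build,
 * and n a multiple of 16. */
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>

typedef double FLOAT;     /* assumed: double-precision build */
typedef long   BLASLONG;  /* assumed: 64-bit index type      */

/* Same shape as the AVX2 loop added by the patch: y += alpha * x,
 * 16 doubles (four 256-bit vectors) per iteration. */
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
        BLASLONG i = 0;
        __m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));

        for (; i < n; i += 16) {
                _mm256_storeu_pd(&y[i +  0], _mm256_loadu_pd(&y[i +  0]) + __alpha * _mm256_loadu_pd(&x[i +  0]));
                _mm256_storeu_pd(&y[i +  4], _mm256_loadu_pd(&y[i +  4]) + __alpha * _mm256_loadu_pd(&x[i +  4]));
                _mm256_storeu_pd(&y[i +  8], _mm256_loadu_pd(&y[i +  8]) + __alpha * _mm256_loadu_pd(&x[i +  8]));
                _mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12]));
        }
}

int main(void)
{
        BLASLONG n = 1024;   /* multiple of 16, as the kernel requires */
        FLOAT alpha = 2.5;
        FLOAT *x = malloc(n * sizeof(FLOAT));
        FLOAT *y = malloc(n * sizeof(FLOAT));

        for (BLASLONG i = 0; i < n; i++) { x[i] = (FLOAT)i; y[i] = 1.0; }

        daxpy_kernel_8(n, x, y, &alpha);

        /* All values involved are exactly representable, so the expected
         * result y[i] = 1.0 + 2.5 * i can be compared for exact equality. */
        for (BLASLONG i = 0; i < n; i++) {
                if (y[i] != 1.0 + alpha * (FLOAT)i) {
                        printf("mismatch at %ld: %f\n", i, y[i]);
                        return 1;
                }
        }
        printf("ok\n");
        free(x);
        free(y);
        return 0;
}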