From 28e96458e5a4b2d8039ed16048a07892a7c960bf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 22 Jul 2019 08:28:16 +0200 Subject: [PATCH] Replace vpermpd with vpermilpd to improve performance on Zen/Zen2 (as demonstrated by wjc404 in #2180) --- kernel/x86_64/zdot_microk_haswell-2.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 9f2fc2c1d..4eade7bfd 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" @@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t"