From 350531e76a25c7cf9e9007d52c272760e35e9151 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 4 Aug 2018 14:44:04 +0000 Subject: [PATCH 01/19] dgemv_n_microk_haswell: Use symbolic names for asm inputs to make the code more readable gcc assembly syntax supports symbolic names in addition to numeric parameter order; it's generally more readable to have code use the symbolic names --- kernel/x86_64/dgemv_n_microk_haswell-4.c | 60 ++++++++++++------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index 584a6c6b5..f9995c5e1 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -37,19 +37,19 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT __asm__ __volatile__ ( - "vbroadcastsd (%2), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + "vbroadcastsd (%[x]), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%[x]), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%[x]), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%[x]), %%ymm15 \n\t" // x3 - "vmovups (%4,%0,8), %%ymm0 \n\t" - "vmovups (%5,%0,8), %%ymm1 \n\t" - "vmovups (%6,%0,8), %%ymm2 \n\t" - "vmovups (%7,%0,8), %%ymm3 \n\t" - "vbroadcastsd (%8), %%ymm6 \n\t" // alpha + "vmovups (%[ap0],%[i],8), %%ymm0 \n\t" + "vmovups (%[ap1],%[i],8), %%ymm1 \n\t" + "vmovups (%[ap2],%[i],8), %%ymm2 \n\t" + "vmovups (%[ap3],%[i],8), %%ymm3 \n\t" + "vbroadcastsd (%[alpha]), %%ymm6 \n\t" // alpha - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" + "addq $4 , %[i] \n\t" + "subq $4 , %[n] \n\t" "jz 2f \n\t" // ".align 16 \n\t" @@ -57,21 +57,21 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" - "vmovups (%4,%0,8), %%ymm0 \n\t" - "vmovups (%5,%0,8), %%ymm1 \n\t" + "vmovups (%[ap0],%[i],8), %%ymm0 \n\t" + "vmovups (%[ap1],%[i],8), %%ymm1 \n\t" "vfmadd231pd %%ymm2 , %%ymm14, %%ymm4 \n\t" "vfmadd231pd %%ymm3 , %%ymm15, %%ymm5 \n\t" - "vmovups (%6,%0,8), %%ymm2 \n\t" - "vmovups (%7,%0,8), %%ymm3 \n\t" + "vmovups (%[ap2],%[i],8), %%ymm2 \n\t" + "vmovups (%[ap3],%[i],8), %%ymm3 \n\t" - "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovups -32(%[y],%[i],8), %%ymm8 \n\t" // 4 * y "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y + "vmovups %%ymm8, -32(%[y],%[i],8) \n\t" // 4 * y - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" + "addq $4 , %[i] \n\t" + "subq $4 , %[n] \n\t" "jnz 1b \n\t" @@ -83,26 +83,26 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vfmadd231pd %%ymm3 , %%ymm15, %%ymm5 \n\t" - "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovups -32(%[y],%[i],8), %%ymm8 \n\t" // 4 * y "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y + "vmovups %%ymm8, -32(%[y],%[i],8) \n\t" // 4 * y "vzeroupper \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 + [i] "+r" (i), // 0 + [n] "+r" (n) // 1 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (alpha) // 8 + [x] "r" (x), // 2 + [y] "r" (y), // 3 + [ap0] "r" (ap[0]), // 4 + [ap1] "r" (ap[1]), // 5 + [ap2] "r" (ap[2]), // 6 + [ap3] "r" (ap[3]), // 7 + [alpha] "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", "%xmm6", 
"%xmm7", From 4a8ae8b8aacb7de466183b5637632bdeeccbd262 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 4 Aug 2018 17:25:54 +0000 Subject: [PATCH 02/19] replace the hasell dgemv_kernel_4x4 kernel with a the same code written in intrinsics using intrinsics is a bit easier to read (at least for the non-math part of the code) and also allows the compiler to be better about register allocation and optimizing the non-math (loop/setup) code. It also allows the code to honor the "no fma" flag if the user so desires. The result of this change is (measured for a size of 16) a 15% performance increase. And it is a step towards being able to add an AVX512 version of the code. --- kernel/x86_64/dgemv_n_microk_haswell-4.c | 96 ++++++------------------ 1 file changed, 25 insertions(+), 71 deletions(-) diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index f9995c5e1..e3221d23f 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -28,88 +28,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HAVE_KERNEL_4x4 1 -static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +#include static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { - BLASLONG register i = 0; + int i = 0; - __asm__ __volatile__ - ( - "vbroadcastsd (%[x]), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%[x]), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%[x]), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%[x]), %%ymm15 \n\t" // x3 + __m256d x0, x1, x2, x3; + __m256d __alpha; - "vmovups (%[ap0],%[i],8), %%ymm0 \n\t" - "vmovups (%[ap1],%[i],8), %%ymm1 \n\t" - "vmovups (%[ap2],%[i],8), %%ymm2 \n\t" - "vmovups (%[ap3],%[i],8), %%ymm3 \n\t" - "vbroadcastsd (%[alpha]), %%ymm6 \n\t" // alpha + x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); + x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); + x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2])); + x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3])); - "addq $4 , %[i] \n\t" - "subq $4 , %[n] \n\t" - "jz 2f \n\t" - - // ".align 16 \n\t" - "1: \n\t" - - "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" - "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" - "vmovups (%[ap0],%[i],8), %%ymm0 \n\t" - "vmovups (%[ap1],%[i],8), %%ymm1 \n\t" - "vfmadd231pd %%ymm2 , %%ymm14, %%ymm4 \n\t" - "vfmadd231pd %%ymm3 , %%ymm15, %%ymm5 \n\t" - "vmovups (%[ap2],%[i],8), %%ymm2 \n\t" - "vmovups (%[ap3],%[i],8), %%ymm3 \n\t" - - "vmovups -32(%[y],%[i],8), %%ymm8 \n\t" // 4 * y - "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - - "vmovups %%ymm8, -32(%[y],%[i],8) \n\t" // 4 * y - - "addq $4 , %[i] \n\t" - "subq $4 , %[n] \n\t" - "jnz 1b \n\t" - - - "2: \n\t" - - "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" - "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" - "vfmadd231pd %%ymm2 , %%ymm14, %%ymm4 \n\t" - "vfmadd231pd %%ymm3 , %%ymm15, %%ymm5 \n\t" + __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); - "vmovups -32(%[y],%[i],8), %%ymm8 \n\t" // 4 * y - "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + for (i = 0; i < n; i+= 4) { + __m256d tempY; + __m256d sum; - "vmovups %%ymm8, -32(%[y],%[i],8) \n\t" // 4 * y + sum = _mm256_add_pd( + _mm256_add_pd( + _mm256_mul_pd(_mm256_loadu_pd(&ap[0][i]), x0), + _mm256_mul_pd(_mm256_loadu_pd(&ap[1][i]), x1)), + _mm256_add_pd( + _mm256_mul_pd(_mm256_loadu_pd(&ap[2][i]), x2), + _mm256_mul_pd(_mm256_loadu_pd(&ap[3][i]), x3)) + ); - - "vzeroupper 
\n\t" - - : - [i] "+r" (i), // 0 - [n] "+r" (n) // 1 - : - [x] "r" (x), // 2 - [y] "r" (y), // 3 - [ap0] "r" (ap[0]), // 4 - [ap1] "r" (ap[1]), // 5 - [ap2] "r" (ap[2]), // 6 - [ap3] "r" (ap[3]), // 7 - [alpha] "r" (alpha) // 8 - : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); + tempY = _mm256_loadu_pd(&y[i]); + tempY = _mm256_add_pd(tempY, _mm256_mul_pd(sum, __alpha)); + _mm256_storeu_pd(&y[i], tempY); + } } From e52d01cfe739b17d785d74cf15e6db87deb1690e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 4 Aug 2018 17:53:55 +0000 Subject: [PATCH 03/19] Also make the kernel_4x2 use intrinsics for readability and consistency --- kernel/x86_64/dgemv_n_microk_haswell-4.c | 78 ++++++------------------ 1 file changed, 17 insertions(+), 61 deletions(-) diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index e3221d23f..80879fdee 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -70,76 +70,32 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT #define HAVE_KERNEL_4x2 -static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); - static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { - BLASLONG register i = 0; + int i = 0; - __asm__ __volatile__ - ( - "vbroadcastsd (%2), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + __m256d x0, x1; + __m256d __alpha; - "vmovups (%4,%0,8), %%ymm0 \n\t" - "vmovups (%5,%0,8), %%ymm1 \n\t" + x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); + x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); - "vbroadcastsd (%6), %%ymm6 \n\t" // alpha - - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - "jz 2f \n\t" - - "1: \n\t" - - "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" - "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" - "vmovups (%4,%0,8), %%ymm0 \n\t" - "vmovups (%5,%0,8), %%ymm1 \n\t" - - "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y - "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - - "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y - - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - "jnz 1b \n\t" - - - "2: \n\t" - - "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" - "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" + __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); - "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y - "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + for (i = 0; i < n; i+= 4) { + __m256d tempY; + __m256d sum; - "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y + sum = _mm256_add_pd( + _mm256_mul_pd(_mm256_loadu_pd(&ap[0][i]), x0), + _mm256_mul_pd(_mm256_loadu_pd(&ap[1][i]), x1) + ); + tempY = _mm256_loadu_pd(&y[i]); + tempY = _mm256_add_pd(tempY, _mm256_mul_pd(sum, __alpha)); + _mm256_storeu_pd(&y[i], tempY); + } - "vzeroupper \n\t" - - - : - "+r" (i), // 0 - "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (alpha) // 6 - : "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", - "%xmm6", - "%xmm8", - "%xmm12", "%xmm13", - "memory" - ); } From df31ec064eeaaad61586a93f4d165273760a33b5 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 4 Aug 2018 20:48:59 +0000 Subject: [PATCH 04/19] Add AVX512 support to the dgemv_n_microk_haswell-4.c kernel Now that the kernel is written in C-with-intrinsics, adding AVX512 support to this kernel is trivial and yields a pretty significant 
performance increase --- kernel/x86_64/dgemv_n_microk_haswell-4.c | 40 +++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index 80879fdee..26c3d1ae8 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -25,7 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/* Ensure that the compiler knows how to generate AVX2 instructions if it doesn't already*/ +#ifndef __AVX512CD_ +#pragma GCC target("avx2,fma") +#endif +#ifdef __AVX2__ #define HAVE_KERNEL_4x4 1 @@ -46,8 +51,39 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); +#ifdef __AVX512CD__ + int n5; + __m512d x05, x15, x25, x35; + __m512d __alpha5; + n5 = n & ~7; - for (i = 0; i < n; i+= 4) { + x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0])); + x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1])); + x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2])); + x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3])); + + __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); + + for (; i < n5; i+= 8) { + __m512d tempY; + __m512d sum; + + sum = _mm512_add_pd( + _mm512_add_pd( + _mm512_mul_pd(_mm512_loadu_pd(&ap[0][i]), x05), + _mm512_mul_pd(_mm512_loadu_pd(&ap[1][i]), x15)), + _mm512_add_pd( + _mm512_mul_pd(_mm512_loadu_pd(&ap[2][i]), x25), + _mm512_mul_pd(_mm512_loadu_pd(&ap[3][i]), x35)) + ); + + tempY = _mm512_loadu_pd(&y[i]); + tempY = _mm512_add_pd(tempY, _mm512_mul_pd(sum, __alpha5)); + _mm512_storeu_pd(&y[i], tempY); + } +#endif + + for (; i < n; i+= 4) { __m256d tempY; __m256d sum; @@ -99,3 +135,5 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT } } + +#endif /* AVX2 */ From 0faba28adb27411db73d00987676e857823b0112 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 4 Aug 2018 21:25:53 +0000 Subject: [PATCH 05/19] dsymv_L haswell: use symbol names for inline asm symbolic names for gcc inline assembly are much easier to read --- kernel/x86_64/dsymv_L_microk_haswell-2.c | 62 ++++++++++++------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c index 866782ee6..2f97bfcd9 100644 --- a/kernel/x86_64/dsymv_L_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -39,21 +39,21 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] "vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] "vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] - "vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0] - "vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1] - "vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1] - "vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1] + "vbroadcastsd (%[temp1]), %%ymm4 \n\t" // temp1[0] + "vbroadcastsd 8(%[temp1]), %%ymm5 \n\t" // temp1[1] + "vbroadcastsd 16(%[temp1]), %%ymm6 \n\t" // temp1[1] + "vbroadcastsd 24(%[temp1]), %%ymm7 \n\t" // temp1[1] ".p2align 4 \n\t" "1: \n\t" - "vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y - "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups (%[y],%[from],8), %%ymm9 \n\t" // 2 * y + "vmovups (%[x],%[from],8), %%ymm8 \n\t" // 2 * x - "vmovups (%4,%0,8), %%ymm12 \n\t" // 2 * a - "vmovups 
(%5,%0,8), %%ymm13 \n\t" // 2 * a - "vmovups (%6,%0,8), %%ymm14 \n\t" // 2 * a - "vmovups (%7,%0,8), %%ymm15 \n\t" // 2 * a + "vmovups (%[a0],%[from],8), %%ymm12 \n\t" // 2 * a + "vmovups (%[a1],%[from],8), %%ymm13 \n\t" // 2 * a + "vmovups (%[a2],%[from],8), %%ymm14 \n\t" // 2 * a + "vmovups (%[a3],%[from],8), %%ymm15 \n\t" // 2 * a "vfmadd231pd %%ymm4, %%ymm12 , %%ymm9 \n\t" // y += temp1 * a "vfmadd231pd %%ymm8, %%ymm12 , %%ymm0 \n\t" // temp2 += x * a @@ -66,17 +66,17 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vfmadd231pd %%ymm7, %%ymm15 , %%ymm9 \n\t" // y += temp1 * a "vfmadd231pd %%ymm8, %%ymm15 , %%ymm3 \n\t" // temp2 += x * a - "addq $4 , %0 \n\t" + "addq $4 , %[from] \n\t" - "vmovups %%ymm9 , -32(%3,%0,8) \n\t" + "vmovups %%ymm9 , -32(%[y],%[from],8) \n\t" - "cmpq %0 , %1 \n\t" + "cmpq %[from] , %[to] \n\t" "jnz 1b \n\t" - "vmovsd (%9), %%xmm4 \n\t" - "vmovsd 8(%9), %%xmm5 \n\t" - "vmovsd 16(%9), %%xmm6 \n\t" - "vmovsd 24(%9), %%xmm7 \n\t" + "vmovsd (%[temp2]), %%xmm4 \n\t" + "vmovsd 8(%[temp2]), %%xmm5 \n\t" + "vmovsd 16(%[temp2]), %%xmm6 \n\t" + "vmovsd 24(%[temp2]), %%xmm7 \n\t" "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" @@ -98,24 +98,24 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" - "vmovsd %%xmm0 , (%9) \n\t" // save temp2 - "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2 - "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2 - "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + "vmovsd %%xmm0 , (%[temp2]) \n\t" // save temp2 + "vmovsd %%xmm1 , 8(%[temp2]) \n\t" // save temp2 + "vmovsd %%xmm2 ,16(%[temp2]) \n\t" // save temp2 + "vmovsd %%xmm3 ,24(%[temp2]) \n\t" // save temp2 "vzeroupper \n\t" : : - "r" (from), // 0 - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (a[0]), // 4 - "r" (a[1]), // 5 - "r" (a[2]), // 6 - "r" (a[3]), // 8 - "r" (temp1), // 8 - "r" (temp2) // 9 + [from] "r" (from), // 0 + [to] "r" (to), // 1 + [x] "r" (x), // 2 + [y] "r" (y), // 3 + [a0] "r" (a[0]), // 4 + [a1] "r" (a[1]), // 5 + [a2] "r" (a[2]), // 6 + [a3] "r" (a[3]), // 7 + [temp1] "r" (temp1), // 8 + [temp2] "r" (temp2) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", From c202e06297f82ca0e592d2a02fa8d28eaef7a7ee Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 4 Aug 2018 23:35:36 +0000 Subject: [PATCH 06/19] Write dsymv_kernel_4x4 for Haswell using intrinsics intrinsics make the non-math part of the code easier to follow than all hand coded asm, and it also helps getting ready for adding avx512 support --- kernel/x86_64/dsymv_L_microk_haswell-2.c | 119 ++++++++--------------- 1 file changed, 41 insertions(+), 78 deletions(-) diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c index 2f97bfcd9..0f559199e 100644 --- a/kernel/x86_64/dsymv_L_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -25,105 +25,68 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#include + #define HAVE_KERNEL_4x4 1 -static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) { - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] - "vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] - "vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] - "vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] - "vbroadcastsd (%[temp1]), %%ymm4 \n\t" // temp1[0] - "vbroadcastsd 8(%[temp1]), %%ymm5 \n\t" // temp1[1] - "vbroadcastsd 16(%[temp1]), %%ymm6 \n\t" // temp1[1] - "vbroadcastsd 24(%[temp1]), %%ymm7 \n\t" // temp1[1] + __m256d temp2_0, temp2_1, temp2_2, temp2_3; // temp2_0 temp2_1 temp2_2 temp2_3 + __m256d temp1_0, temp1_1, temp1_2, temp1_3; - ".p2align 4 \n\t" - "1: \n\t" + temp2_0 = _mm256_setzero_pd(); + temp2_1 = _mm256_setzero_pd(); + temp2_2 = _mm256_setzero_pd(); + temp2_3 = _mm256_setzero_pd(); - "vmovups (%[y],%[from],8), %%ymm9 \n\t" // 2 * y - "vmovups (%[x],%[from],8), %%ymm8 \n\t" // 2 * x + temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0])); + temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1])); + temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2])); + temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3])); - "vmovups (%[a0],%[from],8), %%ymm12 \n\t" // 2 * a - "vmovups (%[a1],%[from],8), %%ymm13 \n\t" // 2 * a - "vmovups (%[a2],%[from],8), %%ymm14 \n\t" // 2 * a - "vmovups (%[a3],%[from],8), %%ymm15 \n\t" // 2 * a + for (; from != to; from += 4) { + __m256d _x, _y; + __m256d a0, a1, a2, a3; - "vfmadd231pd %%ymm4, %%ymm12 , %%ymm9 \n\t" // y += temp1 * a - "vfmadd231pd %%ymm8, %%ymm12 , %%ymm0 \n\t" // temp2 += x * a + _y = _mm256_loadu_pd(&y[from]); + _x = _mm256_loadu_pd(&x[from]); - "vfmadd231pd %%ymm5, %%ymm13 , %%ymm9 \n\t" // y += temp1 * a - "vfmadd231pd %%ymm8, %%ymm13 , %%ymm1 \n\t" // temp2 += x * a + a0 = _mm256_loadu_pd(&a[0][from]); + a1 = _mm256_loadu_pd(&a[1][from]); + a2 = _mm256_loadu_pd(&a[2][from]); + a3 = _mm256_loadu_pd(&a[3][from]); - "vfmadd231pd %%ymm6, %%ymm14 , %%ymm9 \n\t" // y += temp1 * a - "vfmadd231pd %%ymm8, %%ymm14 , %%ymm2 \n\t" // temp2 += x * a + _y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3; - "vfmadd231pd %%ymm7, %%ymm15 , %%ymm9 \n\t" // y += temp1 * a - "vfmadd231pd %%ymm8, %%ymm15 , %%ymm3 \n\t" // temp2 += x * a - "addq $4 , %[from] \n\t" + temp2_0 += _x * a0; + temp2_1 += _x * a1; + temp2_2 += _x * a2; + temp2_3 += _x * a3; - "vmovups %%ymm9 , -32(%[y],%[from],8) \n\t" + _mm256_storeu_pd(&y[from], _y); - "cmpq %[from] , %[to] \n\t" - "jnz 1b \n\t" + }; - "vmovsd (%[temp2]), %%xmm4 \n\t" - "vmovsd 8(%[temp2]), %%xmm5 \n\t" - "vmovsd 16(%[temp2]), %%xmm6 \n\t" - "vmovsd 24(%[temp2]), %%xmm7 \n\t" + __m128d xmm0, xmm1, xmm2, xmm3; - "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" - "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" - "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" - "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" - "vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t" - "vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t" - "vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t" - "vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t" + xmm0 = _mm_add_pd(_mm256_extractf128_pd(temp2_0, 0), _mm256_extractf128_pd(temp2_0, 1)); + xmm1 = _mm_add_pd(_mm256_extractf128_pd(temp2_1, 0), _mm256_extractf128_pd(temp2_1, 1)); + xmm2 = 
_mm_add_pd(_mm256_extractf128_pd(temp2_2, 0), _mm256_extractf128_pd(temp2_2, 1)); + xmm3 = _mm_add_pd(_mm256_extractf128_pd(temp2_3, 0), _mm256_extractf128_pd(temp2_3, 1)); - "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" - "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" - "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t" - "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t" + xmm0 = _mm_hadd_pd(xmm0, xmm0); + xmm1 = _mm_hadd_pd(xmm1, xmm1); + xmm2 = _mm_hadd_pd(xmm2, xmm2); + xmm3 = _mm_hadd_pd(xmm3, xmm3); - "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" - "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" - "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" - "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" - - "vmovsd %%xmm0 , (%[temp2]) \n\t" // save temp2 - "vmovsd %%xmm1 , 8(%[temp2]) \n\t" // save temp2 - "vmovsd %%xmm2 ,16(%[temp2]) \n\t" // save temp2 - "vmovsd %%xmm3 ,24(%[temp2]) \n\t" // save temp2 - "vzeroupper \n\t" - - : - : - [from] "r" (from), // 0 - [to] "r" (to), // 1 - [x] "r" (x), // 2 - [y] "r" (y), // 3 - [a0] "r" (a[0]), // 4 - [a1] "r" (a[1]), // 5 - [a2] "r" (a[2]), // 6 - [a3] "r" (a[3]), // 7 - [temp1] "r" (temp1), // 8 - [temp2] "r" (temp2) // 9 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); + temp2[0] += xmm0[0]; + temp2[1] += xmm1[0]; + temp2[2] += xmm2[0]; + temp2[3] += xmm3[0]; } From f2810beafb168229ebb091b9f7513afb11f886f4 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 4 Aug 2018 23:56:06 +0000 Subject: [PATCH 07/19] Add AVX512 support to dsymv_L_microk_haswell-2.c Now that the code is written in intrinsics it's relatively easy to add AVX512 support --- kernel/x86_64/dsymv_L_microk_haswell-2.c | 50 ++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c index 0f559199e..5391b3dd0 100644 --- a/kernel/x86_64/dsymv_L_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -46,6 +46,56 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2])); temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3])); +#ifdef __AVX512CD__ + __m512d temp2_05, temp2_15, temp2_25, temp2_35; // temp2_0 temp2_1 temp2_2 temp2_3 + __m512d temp1_05, temp1_15, temp1_25, temp1_35; + BLASLONG to2; + int delta; + + temp2_05 = _mm512_setzero_pd(); + temp2_15 = _mm512_setzero_pd(); + temp2_25 = _mm512_setzero_pd(); + temp2_35 = _mm512_setzero_pd(); + + temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0])); + temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1])); + temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2])); + temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3])); + + delta = (to - from) & ~7; + to2 = from + delta; + + + for (; from < to2; from += 8) { + __m512d _x, _y; + __m512d a0, a1, a2, a3; + + _y = _mm512_loadu_pd(&y[from]); + _x = _mm512_loadu_pd(&x[from]); + + a0 = _mm512_loadu_pd(&a[0][from]); + a1 = _mm512_loadu_pd(&a[1][from]); + a2 = _mm512_loadu_pd(&a[2][from]); + a3 = _mm512_loadu_pd(&a[3][from]); + + _y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3; + + temp2_05 += _x * a0; + temp2_15 += _x * a1; + temp2_25 += _x * a2; + temp2_35 += _x * a3; + + _mm512_storeu_pd(&y[from], _y); + + }; + + temp2_0 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_05, 0), _mm512_extractf64x4_pd(temp2_05, 1)); + temp2_1 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_15, 0), _mm512_extractf64x4_pd(temp2_15, 1)); + 
temp2_2 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_25, 0), _mm512_extractf64x4_pd(temp2_25, 1)); + temp2_3 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_35, 0), _mm512_extractf64x4_pd(temp2_35, 1)); + +#endif + for (; from != to; from += 4) { __m256d _x, _y; __m256d a0, a1, a2, a3; From 9c29524f50c8cd9db81a75c3c557caea01075e92 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 02:44:40 +0000 Subject: [PATCH 08/19] various code cleanups and comments --- kernel/x86_64/dgemv_n_microk_haswell-4.c | 37 ++++----- kernel/x86_64/dsymv_L_microk_haswell-2.c | 96 +++++++++++++++--------- 2 files changed, 72 insertions(+), 61 deletions(-) diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index 26c3d1ae8..69e93a1fb 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -25,7 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -/* Ensure that the compiler knows how to generate AVX2 instructions if it doesn't already*/ +/* Ensure that the compiler knows how to generate AVX2 instructions if it doesn't already */ #ifndef __AVX512CD_ #pragma GCC target("avx2,fma") #endif @@ -68,17 +68,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT __m512d tempY; __m512d sum; - sum = _mm512_add_pd( - _mm512_add_pd( - _mm512_mul_pd(_mm512_loadu_pd(&ap[0][i]), x05), - _mm512_mul_pd(_mm512_loadu_pd(&ap[1][i]), x15)), - _mm512_add_pd( - _mm512_mul_pd(_mm512_loadu_pd(&ap[2][i]), x25), - _mm512_mul_pd(_mm512_loadu_pd(&ap[3][i]), x35)) - ); + sum = _mm512_loadu_pd(&ap[0][i]) * x05 + + _mm512_loadu_pd(&ap[1][i]) * x15 + + _mm512_loadu_pd(&ap[2][i]) * x25 + + _mm512_loadu_pd(&ap[3][i]) * x35; tempY = _mm512_loadu_pd(&y[i]); - tempY = _mm512_add_pd(tempY, _mm512_mul_pd(sum, __alpha5)); + tempY += sum * __alpha5; _mm512_storeu_pd(&y[i], tempY); } #endif @@ -87,17 +83,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT __m256d tempY; __m256d sum; - sum = _mm256_add_pd( - _mm256_add_pd( - _mm256_mul_pd(_mm256_loadu_pd(&ap[0][i]), x0), - _mm256_mul_pd(_mm256_loadu_pd(&ap[1][i]), x1)), - _mm256_add_pd( - _mm256_mul_pd(_mm256_loadu_pd(&ap[2][i]), x2), - _mm256_mul_pd(_mm256_loadu_pd(&ap[3][i]), x3)) - ); + sum = _mm256_loadu_pd(&ap[0][i]) * x0 + + _mm256_loadu_pd(&ap[1][i]) * x1 + + _mm256_loadu_pd(&ap[2][i]) * x2 + + _mm256_loadu_pd(&ap[3][i]) * x3; tempY = _mm256_loadu_pd(&y[i]); - tempY = _mm256_add_pd(tempY, _mm256_mul_pd(sum, __alpha)); + tempY += sum * __alpha; _mm256_storeu_pd(&y[i], tempY); } @@ -124,13 +116,10 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT __m256d tempY; __m256d sum; - sum = _mm256_add_pd( - _mm256_mul_pd(_mm256_loadu_pd(&ap[0][i]), x0), - _mm256_mul_pd(_mm256_loadu_pd(&ap[1][i]), x1) - ); + sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1; tempY = _mm256_loadu_pd(&y[i]); - tempY = _mm256_add_pd(tempY, _mm256_mul_pd(sum, __alpha)); + tempY += sum * __alpha; _mm256_storeu_pd(&y[i], tempY); } diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c index 5391b3dd0..c87a64511 100644 --- a/kernel/x86_64/dsymv_L_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -25,6 +25,14 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + +/* Ensure that the compiler knows how to generate AVX2 instructions if it doesn't already */ +#ifndef __AVX512CD_ +#pragma GCC target("avx2,fma") +#endif + +#ifdef __AVX2__ + #include #define HAVE_KERNEL_4x4 1 @@ -33,13 +41,14 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL { - __m256d temp2_0, temp2_1, temp2_2, temp2_3; // temp2_0 temp2_1 temp2_2 temp2_3 + __m256d accum_0, accum_1, accum_2, accum_3; __m256d temp1_0, temp1_1, temp1_2, temp1_3; - temp2_0 = _mm256_setzero_pd(); - temp2_1 = _mm256_setzero_pd(); - temp2_2 = _mm256_setzero_pd(); - temp2_3 = _mm256_setzero_pd(); + /* the 256 bit wide acculmulator vectors start out as zero */ + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0])); temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1])); @@ -47,15 +56,16 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3])); #ifdef __AVX512CD__ - __m512d temp2_05, temp2_15, temp2_25, temp2_35; // temp2_0 temp2_1 temp2_2 temp2_3 + __m512d accum_05, accum_15, accum_25, accum_35; __m512d temp1_05, temp1_15, temp1_25, temp1_35; BLASLONG to2; int delta; - temp2_05 = _mm512_setzero_pd(); - temp2_15 = _mm512_setzero_pd(); - temp2_25 = _mm512_setzero_pd(); - temp2_35 = _mm512_setzero_pd(); + /* the 512 bit wide accumulator vectors start out as zero */ + accum_05 = _mm512_setzero_pd(); + accum_15 = _mm512_setzero_pd(); + accum_25 = _mm512_setzero_pd(); + accum_35 = _mm512_setzero_pd(); temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0])); temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1])); @@ -80,19 +90,23 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL _y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3; - temp2_05 += _x * a0; - temp2_15 += _x * a1; - temp2_25 += _x * a2; - temp2_35 += _x * a3; + accum_05 += _x * a0; + accum_15 += _x * a1; + accum_25 += _x * a2; + accum_35 += _x * a3; _mm512_storeu_pd(&y[from], _y); }; - temp2_0 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_05, 0), _mm512_extractf64x4_pd(temp2_05, 1)); - temp2_1 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_15, 0), _mm512_extractf64x4_pd(temp2_15, 1)); - temp2_2 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_25, 0), _mm512_extractf64x4_pd(temp2_25, 1)); - temp2_3 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_35, 0), _mm512_extractf64x4_pd(temp2_35, 1)); + /* + * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code + * below can continue using the intermediate results in its loop + */ + accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1)); + accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1)); + accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1)); + accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1)); #endif @@ -103,6 +117,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL _y = _mm256_loadu_pd(&y[from]); _x = _mm256_loadu_pd(&x[from]); + /* load 4 rows of matrix data */ a0 = _mm256_loadu_pd(&a[0][from]); a1 = 
_mm256_loadu_pd(&a[1][from]); a2 = _mm256_loadu_pd(&a[2][from]); @@ -110,33 +125,40 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL _y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3; - temp2_0 += _x * a0; - temp2_1 += _x * a1; - temp2_2 += _x * a2; - temp2_3 += _x * a3; + accum_0 += _x * a0; + accum_1 += _x * a1; + accum_2 += _x * a2; + accum_3 += _x * a3; _mm256_storeu_pd(&y[from], _y); }; - __m128d xmm0, xmm1, xmm2, xmm3; + /* + * we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2 + * output array. There is no direct instruction for this in 256 bit space, only in 128 space. + */ + + __m128d half_accum0, half_accum1, half_accum2, half_accum3; - xmm0 = _mm_add_pd(_mm256_extractf128_pd(temp2_0, 0), _mm256_extractf128_pd(temp2_0, 1)); - xmm1 = _mm_add_pd(_mm256_extractf128_pd(temp2_1, 0), _mm256_extractf128_pd(temp2_1, 1)); - xmm2 = _mm_add_pd(_mm256_extractf128_pd(temp2_2, 0), _mm256_extractf128_pd(temp2_2, 1)); - xmm3 = _mm_add_pd(_mm256_extractf128_pd(temp2_3, 0), _mm256_extractf128_pd(temp2_3, 1)); + /* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */ + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1)); + half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1)); + half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1)); - xmm0 = _mm_hadd_pd(xmm0, xmm0); - xmm1 = _mm_hadd_pd(xmm1, xmm1); - xmm2 = _mm_hadd_pd(xmm2, xmm2); - xmm3 = _mm_hadd_pd(xmm3, xmm3); + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + half_accum1 = _mm_hadd_pd(half_accum1, half_accum1); + half_accum2 = _mm_hadd_pd(half_accum2, half_accum2); + half_accum3 = _mm_hadd_pd(half_accum3, half_accum3); - - temp2[0] += xmm0[0]; - temp2[1] += xmm1[0]; - temp2[2] += xmm2[0]; - temp2[3] += xmm3[0]; + /* and store the lowest double value from each of these vectors in the temp2 output */ + temp2[0] += half_accum0[0]; + temp2[1] += half_accum1[0]; + temp2[2] += half_accum2[0]; + temp2[3] += half_accum3[0]; } - +#endif \ No newline at end of file From 847bbd6f4ceaba98ab94c6e7910516e896e80024 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 14:22:38 +0000 Subject: [PATCH 09/19] use named arguments in the inline asm makes the asm easier to read --- kernel/x86_64/ddot_microk_haswell-2.c | 32 +++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c index 365737363..e14f50370 100644 --- a/kernel/x86_64/ddot_microk_haswell-2.c +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -43,18 +43,18 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) ".p2align 4 \n\t" "1: \n\t" - "vmovups (%2,%0,8), %%ymm12 \n\t" // 2 * x - "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 2 * x - "vmovups 64(%2,%0,8), %%ymm14 \n\t" // 2 * x - "vmovups 96(%2,%0,8), %%ymm15 \n\t" // 2 * x + "vmovups (%[x],%[i],8), %%ymm12 \n\t" // 2 * x + "vmovups 32(%[x],%[i],8), %%ymm13 \n\t" // 2 * x + "vmovups 64(%[x],%[i],8), %%ymm14 \n\t" // 2 * x + "vmovups 96(%[x],%[i],8), %%ymm15 \n\t" // 2 * x - "vfmadd231pd (%3,%0,8), %%ymm12, %%ymm4 \n\t" // 2 * y - "vfmadd231pd 
32(%3,%0,8), %%ymm13, %%ymm5 \n\t" // 2 * y - "vfmadd231pd 64(%3,%0,8), %%ymm14, %%ymm6 \n\t" // 2 * y - "vfmadd231pd 96(%3,%0,8), %%ymm15, %%ymm7 \n\t" // 2 * y + "vfmadd231pd (%[y],%[i],8), %%ymm12, %%ymm4 \n\t" // 2 * y + "vfmadd231pd 32(%[y],%[i],8), %%ymm13, %%ymm5 \n\t" // 2 * y + "vfmadd231pd 64(%[y],%[i],8), %%ymm14, %%ymm6 \n\t" // 2 * y + "vfmadd231pd 96(%[y],%[i],8), %%ymm15, %%ymm7 \n\t" // 2 * y - "addq $16 , %0 \n\t" - "subq $16 , %1 \n\t" + "addq $16 , %[i] \n\t" + "subq $16 , %[n] \n\t" "jnz 1b \n\t" "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t" @@ -73,16 +73,16 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" - "vmovsd %%xmm4, (%4) \n\t" + "vmovsd %%xmm4, (%[dot]) \n\t" "vzeroupper \n\t" : : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 + [i] "r" (i), // 0 + [n] "r" (n), // 1 + [x] "r" (x), // 2 + [y] "r" (y), // 3 + [dot] "r" (dot) // 4 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", From ae38fa55c36142144454ac0e2f501fffecd9468c Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 14:45:54 +0000 Subject: [PATCH 10/19] Use intrinsics instead of inline asm Intrinsics based code is generally easier to read for the non-math part of the algorithm and it's easier to add, say, AVX512 to it later --- kernel/x86_64/ddot_microk_haswell-2.c | 87 ++++++++++----------------- 1 file changed, 32 insertions(+), 55 deletions(-) diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c index e14f50370..8feeb79a1 100644 --- a/kernel/x86_64/ddot_microk_haswell-2.c +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -25,71 +25,48 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +/* Ensure that the compiler knows how to generate AVX2 instructions if it doesn't already */ +#ifndef __AVX512CD_ +#pragma GCC target("avx2,fma") +#endif + +#ifdef __AVX2__ + #define HAVE_KERNEL_8 1 + +#include static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { + int i = 0; + __m256d accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); + for (; i < n; i += 16) { + accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+0]); + accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+4]); + accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+8]); + accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]); + } - BLASLONG register i = 0; + /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */ - __asm__ __volatile__ - ( - "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" - "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" - "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; - ".p2align 4 \n\t" - "1: \n\t" - "vmovups (%[x],%[i],8), %%ymm12 \n\t" // 2 * x - "vmovups 32(%[x],%[i],8), %%ymm13 \n\t" // 2 * x - "vmovups 64(%[x],%[i],8), %%ymm14 \n\t" // 2 * x - "vmovups 96(%[x],%[i],8), %%ymm15 \n\t" // 2 * x + __m128d half_accum0; - "vfmadd231pd (%[y],%[i],8), %%ymm12, %%ymm4 \n\t" // 2 * y - "vfmadd231pd 32(%[y],%[i],8), %%ymm13, %%ymm5 \n\t" // 2 * y - "vfmadd231pd 64(%[y],%[i],8), %%ymm14, %%ymm6 \n\t" // 2 * y - "vfmadd231pd 96(%[y],%[i],8), %%ymm15, %%ymm7 \n\t" // 2 * y + /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */ + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); - "addq $16 , %[i] \n\t" - "subq $16 , %[n] \n\t" - "jnz 1b \n\t" - - "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t" - "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t" - "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t" - "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t" - - "vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t" - "vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t" - "vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t" - "vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t" - - "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" - "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" - "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" - - "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" - - "vmovsd %%xmm4, (%[dot]) \n\t" - "vzeroupper \n\t" - - : - : - [i] "r" (i), // 0 - [n] "r" (n), // 1 - [x] "r" (x), // 2 - [y] "r" (y), // 3 - [dot] "r" (dot) // 4 - : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + *dot = half_accum0[0]; +} +#endif From 34d63df4b3ec7ab6ae1fe54a8b722268e87dc186 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 15:16:20 +0000 Subject: [PATCH 11/19] Add AVX512 support to DDOT now that it's written in C + intrinsics it's easy to add AVX512 support for DDOT --- kernel/x86_64/ddot_microk_haswell-2.c | 37 +++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c index 
8feeb79a1..b9be83af6 100644 --- a/kernel/x86_64/ddot_microk_haswell-2.c +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -26,7 +26,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* Ensure that the compiler knows how to generate AVX2 instructions if it doesn't already */ -#ifndef __AVX512CD_ + +#ifndef __AVX512CD__ #pragma GCC target("avx2,fma") #endif @@ -35,7 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HAVE_KERNEL_8 1 #include -static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { @@ -47,10 +47,37 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) accum_2 = _mm256_setzero_pd(); accum_3 = _mm256_setzero_pd(); +#ifdef __AVX512CD__ + __m512d accum_05, accum_15, accum_25, accum_35; + int n32; + n32 = n & (~31); + + accum_05 = _mm512_setzero_pd(); + accum_15 = _mm512_setzero_pd(); + accum_25 = _mm512_setzero_pd(); + accum_35 = _mm512_setzero_pd(); + + for (; i < n32; i += 32) { + accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]); + accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]); + accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]); + accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]); + } + + /* + * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code + * below can continue using the intermediate results in its loop + */ + accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1)); + accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1)); + accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1)); + accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1)); + +#endif for (; i < n; i += 16) { - accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+0]); - accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+4]); - accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+8]); + accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]); + accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]); + accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]); accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]); } From 21c6220d63a0ad58965e3593c088ac8b9b3a7f3a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 15:16:48 +0000 Subject: [PATCH 12/19] fix typo in dsymv avx512 code path --- kernel/x86_64/dsymv_L_microk_haswell-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c index c87a64511..268692091 100644 --- a/kernel/x86_64/dsymv_L_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* Ensure that the compiler knows how to generate AVX2 instructions if it doesn't already */ -#ifndef __AVX512CD_ +#ifndef __AVX512CD__ #pragma GCC target("avx2,fma") #endif From ef30a7239cc24518084834ed28aeea21ca7463a4 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 16:38:19 +0000 Subject: [PATCH 13/19] sdot_haswell: similar to ddot: turn into intrinsics based C code that supports AVX512 do the same thing for SDOT that the previous patches did for DDOT; the perf gain is in the 60% range so at least somewhat interesting --- kernel/x86_64/sdot_microk_haswell-2.c | 123 +++++++++++++------------- 1 file changed, 62 insertions(+), 61 deletions(-) diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c index df367b61f..070171792 100644 --- a/kernel/x86_64/sdot_microk_haswell-2.c +++ b/kernel/x86_64/sdot_microk_haswell-2.c @@ -25,74 +25,75 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#define HAVE_KERNEL_16 1 -static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); - -static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) -{ - - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" - "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" - "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" - - ".p2align 4 \n\t" - "1: \n\t" - "vmovups (%2,%0,4), %%ymm12 \n\t" // 2 * x - "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 2 * x - "vmovups 64(%2,%0,4), %%ymm14 \n\t" // 2 * x - "vmovups 96(%2,%0,4), %%ymm15 \n\t" // 2 * x - - "vfmadd231ps (%3,%0,4), %%ymm12, %%ymm4 \n\t" // 2 * y - "vfmadd231ps 32(%3,%0,4), %%ymm13, %%ymm5 \n\t" // 2 * y - "vfmadd231ps 64(%3,%0,4), %%ymm14, %%ymm6 \n\t" // 2 * y - "vfmadd231ps 96(%3,%0,4), %%ymm15, %%ymm7 \n\t" // 2 * y - -#ifndef DSDOT - "addq $32 , %0 \n\t" - "subq $32 , %1 \n\t" - "jnz 1b \n\t" +#ifndef __AVX512CD__ +#pragma GCC target("avx2,fma") #endif - "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t" - "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t" - "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t" - "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t" +#ifdef __AVX2__ - "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" - "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" - "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" - "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" +#define HAVE_KERNEL_16 1 - "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" - "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" - "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" +#include - "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" - "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovss %%xmm4, (%4) \n\t" - "vzeroupper \n\t" +{ + int i = 0; + __m256 accum_0, accum_1, accum_2, accum_3; - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); + accum_0 = _mm256_setzero_ps(); + accum_1 = _mm256_setzero_ps(); + accum_2 = _mm256_setzero_ps(); + accum_3 = _mm256_setzero_ps(); -} +#ifdef __AVX512CD__ + __m512 accum_05, accum_15, accum_25, accum_35; + int n64; + n64 = n & (~63); + accum_05 = _mm512_setzero_ps(); + accum_15 = _mm512_setzero_ps(); + accum_25 = _mm512_setzero_ps(); + accum_35 = _mm512_setzero_ps(); + for (; i < n64; i += 64) { + accum_05 += _mm512_loadu_ps(&x[i+ 
0]) * _mm512_loadu_ps(&y[i+ 0]); + accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]); + accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]); + accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]); + } + + /* + * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code + * below can continue using the intermediate results in its loop + */ + accum_0 = _mm256_add_ps(_mm512_extractf32x8_ps(accum_05, 0), _mm512_extractf32x8_ps(accum_05, 1)); + accum_1 = _mm256_add_ps(_mm512_extractf32x8_ps(accum_15, 0), _mm512_extractf32x8_ps(accum_15, 1)); + accum_2 = _mm256_add_ps(_mm512_extractf32x8_ps(accum_25, 0), _mm512_extractf32x8_ps(accum_25, 1)); + accum_3 = _mm256_add_ps(_mm512_extractf32x8_ps(accum_35, 0), _mm512_extractf32x8_ps(accum_35, 1)); + +#endif + for (; i < n; i += 32) { + accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]); + accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]); + accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]); + accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]); + } + + /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */ + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128 half_accum0; + + /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */ + half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1)); + + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + + *dot = half_accum0[0]; +} + +#endif From d86604687f6610ca7fb23e92bae8f549f83e7db3 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 17:16:14 +0000 Subject: [PATCH 14/19] saxpy_haswell: Use named arguments in inline asm Improves readability --- kernel/x86_64/saxpy_microk_haswell-2.c | 44 +++++++++++++------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c index 3a743d64c..36d1cd882 100644 --- a/kernel/x86_64/saxpy_microk_haswell-2.c +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -36,36 +36,36 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) __asm__ __volatile__ ( - "vbroadcastss (%4), %%ymm0 \n\t" // alpha + "vbroadcastss (%[alpha]), %%ymm0 \n\t" // alpha - ".p2align 4 \n\t" - "1: \n\t" + ".p2align 4 \n\t" + "1: \n\t" - "vmovups (%3,%0,4), %%ymm12 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm13 \n\t" // 8 * y - "vmovups 64(%3,%0,4), %%ymm14 \n\t" // 8 * y - "vmovups 96(%3,%0,4), %%ymm15 \n\t" // 8 * y - "vfmadd231ps (%2,%0,4), %%ymm0 , %%ymm12 \n\t" // y += alpha * x - "vfmadd231ps 32(%2,%0,4), %%ymm0 , %%ymm13 \n\t" // y += alpha * x - "vfmadd231ps 64(%2,%0,4), %%ymm0 , %%ymm14 \n\t" // y += alpha * x - "vfmadd231ps 96(%2,%0,4), %%ymm0 , %%ymm15 \n\t" // y += alpha * x - "vmovups %%ymm12, (%3,%0,4) \n\t" - "vmovups %%ymm13, 32(%3,%0,4) \n\t" - "vmovups %%ymm14, 64(%3,%0,4) \n\t" - "vmovups %%ymm15, 96(%3,%0,4) \n\t" + "vmovups (%[y],%[i],4), %%ymm12 \n\t" // 8 * y + "vmovups 32(%[y],%[i],4), %%ymm13 \n\t" // 8 * y + "vmovups 64(%[y],%[i],4), %%ymm14 \n\t" // 8 * y + "vmovups 96(%[y],%[i],4), %%ymm15 \n\t" // 8 * y + "vfmadd231ps (%[x],%[i],4), %%ymm0 , %%ymm12 \n\t" // y += alpha * x + "vfmadd231ps 32(%[x],%[i],4), %%ymm0 , %%ymm13 \n\t" // 
y += alpha * x + "vfmadd231ps 64(%[x],%[i],4), %%ymm0 , %%ymm14 \n\t" // y += alpha * x + "vfmadd231ps 96(%[x],%[i],4), %%ymm0 , %%ymm15 \n\t" // y += alpha * x + "vmovups %%ymm12, (%[y],%[i],4) \n\t" + "vmovups %%ymm13, 32(%[y],%[i],4) \n\t" + "vmovups %%ymm14, 64(%[y],%[i],4) \n\t" + "vmovups %%ymm15, 96(%[y],%[i],4) \n\t" - "addq $32, %0 \n\t" - "subq $32, %1 \n\t" + "addq $32, %[i] \n\t" + "subq $32, %[n] \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 + [i] "r" (i), // 0 + [n] "r" (n), // 1 + [x] "r" (x), // 2 + [y] "r" (y), // 3 + [alpha] "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", From 06ea72f5a5d758c43cac19b6c2782032f5125cb9 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 17:43:40 +0000 Subject: [PATCH 15/19] write saxpy_haswell kernel using C intrinsics and don't disallow inlining the intrinsics version of saxpy is more readable than the inline asm version, and in the intrinsics version there's no reason anymore to ban inlining (since the compiler has full visibility now) which gives a mid single digits improvement in performance --- kernel/x86_64/saxpy_microk_haswell-2.c | 69 +++++++++++--------------- 1 file changed, 29 insertions(+), 40 deletions(-) diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c index 36d1cd882..2ca8270b2 100644 --- a/kernel/x86_64/saxpy_microk_haswell-2.c +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -25,54 +25,43 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + +#ifndef __AVX512CD__ +#pragma GCC target("avx2,fma") +#endif + +#ifdef __AVX2__ + #define HAVE_KERNEL_16 1 -static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +#include static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + BLASLONG i = 0; + __m256 __alpha; - BLASLONG register i = 0; + __alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha)); - __asm__ __volatile__ - ( - "vbroadcastss (%[alpha]), %%ymm0 \n\t" // alpha + for (; i < n; i+= 32) { + __m256 y0, y8, y16, y24; - ".p2align 4 \n\t" - "1: \n\t" + y0 = _mm256_loadu_ps(&y[i + 0]); + y8 = _mm256_loadu_ps(&y[i + 8]); + y16 = _mm256_loadu_ps(&y[i + 16]); + y24 = _mm256_loadu_ps(&y[i + 24]); - "vmovups (%[y],%[i],4), %%ymm12 \n\t" // 8 * y - "vmovups 32(%[y],%[i],4), %%ymm13 \n\t" // 8 * y - "vmovups 64(%[y],%[i],4), %%ymm14 \n\t" // 8 * y - "vmovups 96(%[y],%[i],4), %%ymm15 \n\t" // 8 * y - "vfmadd231ps (%[x],%[i],4), %%ymm0 , %%ymm12 \n\t" // y += alpha * x - "vfmadd231ps 32(%[x],%[i],4), %%ymm0 , %%ymm13 \n\t" // y += alpha * x - "vfmadd231ps 64(%[x],%[i],4), %%ymm0 , %%ymm14 \n\t" // y += alpha * x - "vfmadd231ps 96(%[x],%[i],4), %%ymm0 , %%ymm15 \n\t" // y += alpha * x - "vmovups %%ymm12, (%[y],%[i],4) \n\t" - "vmovups %%ymm13, 32(%[y],%[i],4) \n\t" - "vmovups %%ymm14, 64(%[y],%[i],4) \n\t" - "vmovups %%ymm15, 96(%[y],%[i],4) \n\t" - - "addq $32, %[i] \n\t" - "subq $32, %[n] \n\t" - "jnz 1b \n\t" - "vzeroupper \n\t" - - : - : - [i] "r" (i), // 0 - [n] "r" (n), // 1 - [x] "r" (x), // 2 - [y] "r" (y), // 3 - [alpha] "r" (alpha) // 4 - : "cc", - "%xmm0", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} + y0 += __alpha * _mm256_loadu_ps(&x[i + 0]); + y8 += __alpha * 
_mm256_loadu_ps(&x[i + 8]); + y16 += __alpha * _mm256_loadu_ps(&x[i + 16]); + y24 += __alpha * _mm256_loadu_ps(&x[i + 24]); + _mm256_storeu_ps(&y[i + 0], y0); + _mm256_storeu_ps(&y[i + 8], y8); + _mm256_storeu_ps(&y[i + 16], y16); + _mm256_storeu_ps(&y[i + 24], y24); + } +} +#endif From 850b73dbb98e8beeee1f71c9978287ac8b5fb9eb Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 17:50:16 +0000 Subject: [PATCH 16/19] saxpy_haswell: Add AVX512 support avx512 support fits nicely in the C+intrinsics code and gets a speed improvement for vectors where the saxpy operation is not fully memory bound --- kernel/x86_64/saxpy_microk_haswell-2.c | 28 ++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c index 2ca8270b2..bf5099b70 100644 --- a/kernel/x86_64/saxpy_microk_haswell-2.c +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -44,6 +44,34 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) __alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha)); +#ifdef __AVX512CD__ + BLASLONG n64; + __m512 __alpha5; + __alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha)); + + n64 = n & ~63; + + for (; i < n64; i+= 64) { + __m512 y0, y16, y32, y48; + + y0 = _mm512_loadu_ps(&y[i + 0]); + y16 = _mm512_loadu_ps(&y[i + 16]); + y32 = _mm512_loadu_ps(&y[i + 32]); + y48 = _mm512_loadu_ps(&y[i + 48]); + + y0 += __alpha5 * _mm512_loadu_ps(&x[i + 0]); + y16 += __alpha5 * _mm512_loadu_ps(&x[i + 16]); + y32 += __alpha5 * _mm512_loadu_ps(&x[i + 32]); + y48 += __alpha5 * _mm512_loadu_ps(&x[i + 48]); + + _mm512_storeu_ps(&y[i + 0], y0); + _mm512_storeu_ps(&y[i + 16], y16); + _mm512_storeu_ps(&y[i + 32], y32); + _mm512_storeu_ps(&y[i + 48], y48); + } + +#endif + for (; i < n; i+= 32) { __m256 y0, y8, y16, y24; From 7af8a5445d0cd3750521d57c7495c977e6f1dc2c Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 18:28:47 +0000 Subject: [PATCH 17/19] saxpy_haswell: Go to a more compact intrinsics notation --- kernel/x86_64/saxpy_microk_haswell-2.c | 40 ++++++-------------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c index bf5099b70..0f6587367 100644 --- a/kernel/x86_64/saxpy_microk_haswell-2.c +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -52,43 +52,19 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) n64 = n & ~63; for (; i < n64; i+= 64) { - __m512 y0, y16, y32, y48; - - y0 = _mm512_loadu_ps(&y[i + 0]); - y16 = _mm512_loadu_ps(&y[i + 16]); - y32 = _mm512_loadu_ps(&y[i + 32]); - y48 = _mm512_loadu_ps(&y[i + 48]); - - y0 += __alpha5 * _mm512_loadu_ps(&x[i + 0]); - y16 += __alpha5 * _mm512_loadu_ps(&x[i + 16]); - y32 += __alpha5 * _mm512_loadu_ps(&x[i + 32]); - y48 += __alpha5 * _mm512_loadu_ps(&x[i + 48]); - - _mm512_storeu_ps(&y[i + 0], y0); - _mm512_storeu_ps(&y[i + 16], y16); - _mm512_storeu_ps(&y[i + 32], y32); - _mm512_storeu_ps(&y[i + 48], y48); + _mm512_storeu_ps(&y[i + 0], _mm512_loadu_ps(&y[i + 0]) + __alpha5 * _mm512_loadu_ps(&x[i + 0])); + _mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16])); + _mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32])); + _mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48])); } #endif for (; i < n; i+= 32) { - __m256 y0, y8, y16, y24; - - y0 = _mm256_loadu_ps(&y[i + 0]); - y8 = 
_mm256_loadu_ps(&y[i + 8]); - y16 = _mm256_loadu_ps(&y[i + 16]); - y24 = _mm256_loadu_ps(&y[i + 24]); - - y0 += __alpha * _mm256_loadu_ps(&x[i + 0]); - y8 += __alpha * _mm256_loadu_ps(&x[i + 8]); - y16 += __alpha * _mm256_loadu_ps(&x[i + 16]); - y24 += __alpha * _mm256_loadu_ps(&x[i + 24]); - - _mm256_storeu_ps(&y[i + 0], y0); - _mm256_storeu_ps(&y[i + 8], y8); - _mm256_storeu_ps(&y[i + 16], y16); - _mm256_storeu_ps(&y[i + 24], y24); + _mm256_storeu_ps(&y[i + 0], _mm256_loadu_ps(&y[i + 0]) + __alpha * _mm256_loadu_ps(&x[i + 0])); + _mm256_storeu_ps(&y[i + 8], _mm256_loadu_ps(&y[i + 8]) + __alpha * _mm256_loadu_ps(&x[i + 8])); + _mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16])); + _mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24])); } } #endif From 93aa18b1a806d7f2749aace0b7c88831a4c576cc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 18:29:34 +0000 Subject: [PATCH 18/19] daxpy_haswell: Change to C+instrinsics + AVX512 to mimic the change to saxpy_haswell Use the same transformation as was done to saxpy for daxpy gives a low double digit performance increase --- kernel/x86_64/daxpy_microk_haswell-2.c | 85 ++++++++++++-------------- 1 file changed, 40 insertions(+), 45 deletions(-) diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c index bbe8b9550..c2491ba9b 100644 --- a/kernel/x86_64/daxpy_microk_haswell-2.c +++ b/kernel/x86_64/daxpy_microk_haswell-2.c @@ -25,54 +25,49 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + + +#ifndef __AVX512CD__ +#pragma GCC target("avx2,fma") +#endif + +#ifdef __AVX2__ + +#include + #define HAVE_KERNEL_8 1 -static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { - - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vbroadcastsd (%4), %%ymm0 \n\t" // alpha - - ".p2align 4 \n\t" - "1: \n\t" - - "vmovups (%3,%0,8), %%ymm12 \n\t" // 4 * y - "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 4 * y - "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 4 * y - "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 4 * y - "vfmadd231pd (%2,%0,8), %%ymm0 , %%ymm12 \n\t" // y += alpha * x - "vfmadd231pd 32(%2,%0,8), %%ymm0 , %%ymm13 \n\t" // y += alpha * x - "vfmadd231pd 64(%2,%0,8), %%ymm0 , %%ymm14 \n\t" // y += alpha * x - "vfmadd231pd 96(%2,%0,8), %%ymm0 , %%ymm15 \n\t" // y += alpha * x - "vmovups %%ymm12, (%3,%0,8) \n\t" - "vmovups %%ymm13, 32(%3,%0,8) \n\t" - "vmovups %%ymm14, 64(%3,%0,8) \n\t" - "vmovups %%ymm15, 96(%3,%0,8) \n\t" - - "addq $16, %0 \n\t" - "subq $16, %1 \n\t" - "jnz 1b \n\t" - "vzeroupper \n\t" - - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - : "cc", - "%xmm0", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} + BLASLONG i = 0; + + __m256d __alpha; + + __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + +#ifdef __AVX512CD__ + BLASLONG n32; + __m512d __alpha5; + __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); + + n32 = n & ~31; + + for (; i < n32; i+= 32) { + _mm512_storeu_pd(&y[i + 0], _mm512_loadu_pd(&y[i + 0]) + __alpha5 * _mm512_loadu_pd(&x[i + 0])); + _mm512_storeu_pd(&y[i + 8], _mm512_loadu_pd(&y[i + 8]) + 
__alpha5 * _mm512_loadu_pd(&x[i + 8])); + _mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16])); + _mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24])); + } + +#endif + + for (; i < n; i+= 16) { + _mm256_storeu_pd(&y[i + 0], _mm256_loadu_pd(&y[i + 0]) + __alpha * _mm256_loadu_pd(&x[i + 0])); + _mm256_storeu_pd(&y[i + 4], _mm256_loadu_pd(&y[i + 4]) + __alpha * _mm256_loadu_pd(&x[i + 4])); + _mm256_storeu_pd(&y[i + 8], _mm256_loadu_pd(&y[i + 8]) + __alpha * _mm256_loadu_pd(&x[i + 8])); + _mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12])); + } +} +#endif From b1cc69e7a871b41b0d0c3c32af0fe0981deb1add Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Aug 2018 19:19:49 +0000 Subject: [PATCH 19/19] Convert dscal_haswell to intrinsics and add AVX512 support dscal is a relatively simple function... make it more readable and 50% faster by using C intrinsics and AVX512 support --- kernel/x86_64/dscal_microk_haswell-2.c | 201 +++++-------------------- 1 file changed, 37 insertions(+), 164 deletions(-) diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c index e732a2718..20fb2b283 100644 --- a/kernel/x86_64/dscal_microk_haswell-2.c +++ b/kernel/x86_64/dscal_microk_haswell-2.c @@ -25,182 +25,55 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#define HAVE_KERNEL_8 1 -static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); +#ifndef __AVX512CD__ +#pragma GCC target("avx2,fma") +#endif + +#ifdef __AVX2__ + +#include + +#define HAVE_KERNEL_8 1 static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) { + int i = 0; - - BLASLONG n1 = n >> 4 ; - BLASLONG n2 = n & 8 ; - - __asm__ __volatile__ - ( - "vmovddup (%2), %%xmm0 \n\t" // alpha - - "addq $128, %1 \n\t" - - "cmpq $0, %0 \n\t" - "je 4f \n\t" - - "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" - "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" - "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" - "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" - - "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" - "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" - "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" - "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" - - "subq $1 , %0 \n\t" - "jz 2f \n\t" - - ".p2align 4 \n\t" - "1: \n\t" - // "prefetcht0 640(%1) \n\t" - - "vmovups %%xmm4 ,-128(%1) \n\t" - "vmovups %%xmm5 ,-112(%1) \n\t" - "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" - "vmovups %%xmm6 , -96(%1) \n\t" - "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t" - "vmovups %%xmm7 , -80(%1) \n\t" - "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" - - // "prefetcht0 704(%1) \n\t" - - "vmovups %%xmm8 , -64(%1) \n\t" - "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" - "vmovups %%xmm9 , -48(%1) \n\t" - "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" - "vmovups %%xmm10 , -32(%1) \n\t" - "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" - "vmovups %%xmm11 , -16(%1) \n\t" - - "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" - "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" - - - "addq $128, %1 \n\t" - "subq $1 , %0 \n\t" - "jnz 1b \n\t" - - "2: \n\t" - - "vmovups %%xmm4 ,-128(%1) \n\t" - "vmovups %%xmm5 ,-112(%1) \n\t" - "vmovups %%xmm6 , -96(%1) \n\t" - "vmovups %%xmm7 , -80(%1) \n\t" - - "vmovups %%xmm8 , -64(%1) \n\t" - "vmovups %%xmm9 , -48(%1) \n\t" - "vmovups %%xmm10 , -32(%1) \n\t" - "vmovups %%xmm11 , -16(%1) \n\t" - - "addq $128, %1 
\n\t" - - "4: \n\t" - - "cmpq $8 ,%3 \n\t" - "jne 5f \n\t" - - "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" - "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" - "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" - "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" - - "vmovups %%xmm4 ,-128(%1) \n\t" - "vmovups %%xmm5 ,-112(%1) \n\t" - "vmovups %%xmm6 , -96(%1) \n\t" - "vmovups %%xmm7 , -80(%1) \n\t" - - "5: \n\t" - - "vzeroupper \n\t" - - : - : - "r" (n1), // 0 - "r" (x), // 1 - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - +#ifdef __AVX512CD__ + __m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); + for (; i < n; i += 8) { + _mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0])); + } +#else + __m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + for (; i < n; i += 8) { + _mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0])); + _mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4])); + } +#endif } -static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); - static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) { + int i = 0; + /* question to self: Why is this not just memset() */ - BLASLONG n1 = n >> 4 ; - BLASLONG n2 = n & 8 ; - - __asm__ __volatile__ - ( - "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" - - "addq $128, %1 \n\t" - - "cmpq $0, %0 \n\t" - "je 2f \n\t" - - ".p2align 4 \n\t" - "1: \n\t" - - "vmovups %%xmm0 ,-128(%1) \n\t" - "vmovups %%xmm0 ,-112(%1) \n\t" - "vmovups %%xmm0 , -96(%1) \n\t" - "vmovups %%xmm0 , -80(%1) \n\t" - - "vmovups %%xmm0 , -64(%1) \n\t" - "vmovups %%xmm0 , -48(%1) \n\t" - "vmovups %%xmm0 , -32(%1) \n\t" - "vmovups %%xmm0 , -16(%1) \n\t" - - "addq $128, %1 \n\t" - "subq $1 , %0 \n\t" - "jnz 1b \n\t" - - "2: \n\t" - - "cmpq $8 ,%3 \n\t" - "jne 4f \n\t" - - "vmovups %%xmm0 ,-128(%1) \n\t" - "vmovups %%xmm0 ,-112(%1) \n\t" - "vmovups %%xmm0 , -96(%1) \n\t" - "vmovups %%xmm0 , -80(%1) \n\t" - - "4: \n\t" - - "vzeroupper \n\t" - - : - : - "r" (n1), // 0 - "r" (x), // 1 - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); +#ifdef __AVX512CD__ + __m512d zero = _mm512_setzero_pd(); + for (; i < n; i += 8) { + _mm512_storeu_pd(&x[i], zero); + } +#else + __m256d zero = _mm256_setzero_pd(); + for (; i < n; i += 8) { + _mm256_storeu_pd(&x[i + 0], zero); + _mm256_storeu_pd(&x[i + 4], zero); + } +#endif } - +#endif