From 582c589727302938e99bf594bf072d3d9913575e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sat, 6 Oct 2018 23:13:26 +0000
Subject: [PATCH] dgemm/skylakex: replace discrete mul/add with fma

very minor gains since it's not super hot code, but general principles

---
 kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 155 +++++++---------------
 1 file changed, 49 insertions(+), 106 deletions(-)

diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c
index bb121ca69..a83ca98fa 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c
@@ -927,39 +927,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "jg .label24\n"
 /* multiply the result by alpha */
 "vbroadcastsd (%[alpha]), %%zmm9\n"
- "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
- "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
- "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
- "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
- "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
- "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
- "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
- "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
- "vmulpd %%zmm9, %%zmm11, %%zmm11\n"
- "vmulpd %%zmm9, %%zmm12, %%zmm12\n"
- "vmulpd %%zmm9, %%zmm13, %%zmm13\n"
- "vmulpd %%zmm9, %%zmm14, %%zmm14\n"
- "vmulpd %%zmm9, %%zmm15, %%zmm15\n"
- "vmulpd %%zmm9, %%zmm16, %%zmm16\n"
- "vmulpd %%zmm9, %%zmm17, %%zmm17\n"
- "vmulpd %%zmm9, %%zmm18, %%zmm18\n"
- "vmulpd %%zmm9, %%zmm21, %%zmm21\n"
- "vmulpd %%zmm9, %%zmm22, %%zmm22\n"
- "vmulpd %%zmm9, %%zmm23, %%zmm23\n"
- "vmulpd %%zmm9, %%zmm24, %%zmm24\n"
- "vmulpd %%zmm9, %%zmm25, %%zmm25\n"
- "vmulpd %%zmm9, %%zmm26, %%zmm26\n"
- "vmulpd %%zmm9, %%zmm27, %%zmm27\n"
- "vmulpd %%zmm9, %%zmm28, %%zmm28\n"
 /* And store additively in C */
- "vaddpd (%[C0]), %%zmm1, %%zmm1\n"
- "vaddpd (%[C1]), %%zmm2, %%zmm2\n"
- "vaddpd (%[C2]), %%zmm3, %%zmm3\n"
- "vaddpd (%[C3]), %%zmm4, %%zmm4\n"
- "vaddpd (%[C4]), %%zmm5, %%zmm5\n"
- "vaddpd (%[C5]), %%zmm6, %%zmm6\n"
- "vaddpd (%[C6]), %%zmm7, %%zmm7\n"
- "vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+ "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n"
+ "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n"
+ "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n"
+ "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n"
+ "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n"
+ "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n"
+ "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n"
+ "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n"
 "vmovupd %%zmm1, (%[C0])\n"
 "vmovupd %%zmm2, (%[C1])\n"
 "vmovupd %%zmm3, (%[C2])\n"
@@ -969,14 +945,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "vmovupd %%zmm7, (%[C6])\n"
 "vmovupd %%zmm8, (%[C7])\n"
 
- "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n"
- "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n"
- "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n"
- "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n"
- "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n"
- "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n"
- "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n"
- "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n"
+ "vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n"
+ "vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n"
+ "vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n"
+ "vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n"
+ "vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n"
+ "vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n"
+ "vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n"
+ "vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n"
 "vmovupd %%zmm11, 64(%[C0])\n"
 "vmovupd %%zmm12, 64(%[C1])\n"
 "vmovupd %%zmm13, 64(%[C2])\n"
@@ -986,14 +962,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "vmovupd %%zmm17, 64(%[C6])\n"
 "vmovupd %%zmm18, 64(%[C7])\n"
 
- "vaddpd 128(%[C0]), %%zmm21, %%zmm21\n"
- "vaddpd 128(%[C1]), %%zmm22, %%zmm22\n"
- "vaddpd 128(%[C2]), %%zmm23, %%zmm23\n"
- "vaddpd 128(%[C3]), %%zmm24, %%zmm24\n"
- "vaddpd 128(%[C4]), %%zmm25, %%zmm25\n"
- "vaddpd 128(%[C5]), %%zmm26, %%zmm26\n"
- "vaddpd 128(%[C6]), %%zmm27, %%zmm27\n"
- "vaddpd 128(%[C7]), %%zmm28, %%zmm28\n"
+ "vfmadd213pd 128(%[C0]), %%zmm9, %%zmm21\n"
+ "vfmadd213pd 128(%[C1]), %%zmm9, %%zmm22\n"
+ "vfmadd213pd 128(%[C2]), %%zmm9, %%zmm23\n"
+ "vfmadd213pd 128(%[C3]), %%zmm9, %%zmm24\n"
+ "vfmadd213pd 128(%[C4]), %%zmm9, %%zmm25\n"
+ "vfmadd213pd 128(%[C5]), %%zmm9, %%zmm26\n"
+ "vfmadd213pd 128(%[C6]), %%zmm9, %%zmm27\n"
+ "vfmadd213pd 128(%[C7]), %%zmm9, %%zmm28\n"
 "vmovupd %%zmm21, 128(%[C0])\n"
 "vmovupd %%zmm22, 128(%[C1])\n"
 "vmovupd %%zmm23, 128(%[C2])\n"
@@ -1108,31 +1084,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "jg .label16\n"
 /* multiply the result by alpha */
 "vbroadcastsd (%[alpha]), %%zmm9\n"
- "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
- "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
- "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
- "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
- "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
- "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
- "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
- "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
- "vmulpd %%zmm9, %%zmm11, %%zmm11\n"
- "vmulpd %%zmm9, %%zmm12, %%zmm12\n"
- "vmulpd %%zmm9, %%zmm13, %%zmm13\n"
- "vmulpd %%zmm9, %%zmm14, %%zmm14\n"
- "vmulpd %%zmm9, %%zmm15, %%zmm15\n"
- "vmulpd %%zmm9, %%zmm16, %%zmm16\n"
- "vmulpd %%zmm9, %%zmm17, %%zmm17\n"
- "vmulpd %%zmm9, %%zmm18, %%zmm18\n"
 /* And store additively in C */
- "vaddpd (%[C0]), %%zmm1, %%zmm1\n"
- "vaddpd (%[C1]), %%zmm2, %%zmm2\n"
- "vaddpd (%[C2]), %%zmm3, %%zmm3\n"
- "vaddpd (%[C3]), %%zmm4, %%zmm4\n"
- "vaddpd (%[C4]), %%zmm5, %%zmm5\n"
- "vaddpd (%[C5]), %%zmm6, %%zmm6\n"
- "vaddpd (%[C6]), %%zmm7, %%zmm7\n"
- "vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+ "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n"
+ "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n"
+ "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n"
+ "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n"
+ "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n"
+ "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n"
+ "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n"
+ "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n"
 "vmovupd %%zmm1, (%[C0])\n"
 "vmovupd %%zmm2, (%[C1])\n"
 "vmovupd %%zmm3, (%[C2])\n"
@@ -1142,14 +1102,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "vmovupd %%zmm7, (%[C6])\n"
 "vmovupd %%zmm8, (%[C7])\n"
 
- "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n"
- "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n"
- "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n"
- "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n"
- "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n"
- "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n"
- "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n"
- "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n"
+ "vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n"
+ "vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n"
+ "vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n"
+ "vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n"
+ "vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n"
+ "vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n"
+ "vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n"
+ "vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n"
 "vmovupd %%zmm11, 64(%[C0])\n"
 "vmovupd %%zmm12, 64(%[C1])\n"
 "vmovupd %%zmm13, 64(%[C2])\n"
@@ -1221,24 +1181,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "add $64, %[BO]\n"
 "subl $1, %[kloop]\n"
 "jg .label1\n"
- /* multiply the result by alpha */
- "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
- "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
- "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
- "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
- "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
- "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
- "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
- "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
- /* And store additively in C */
- "vaddpd (%[C0]), %%zmm1, %%zmm1\n"
- "vaddpd (%[C1]), %%zmm2, %%zmm2\n"
- "vaddpd (%[C2]), %%zmm3, %%zmm3\n"
- "vaddpd (%[C3]), %%zmm4, %%zmm4\n"
- "vaddpd (%[C4]), %%zmm5, %%zmm5\n"
- "vaddpd (%[C5]), %%zmm6, %%zmm6\n"
- "vaddpd (%[C6]), %%zmm7, %%zmm7\n"
- "vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+ /* multiply the result by alpha and add to the memory */
+ "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n"
+ "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n"
+ "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n"
+ "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n"
+ "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n"
+ "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n"
+ "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n"
+ "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n"
 "vmovupd %%zmm1, (%[C0])\n"
 "vmovupd %%zmm2, (%[C1])\n"
 "vmovupd %%zmm3, (%[C2])\n"
@@ -1247,14 +1198,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "vmovupd %%zmm6, (%[C5])\n"
 "vmovupd %%zmm7, (%[C6])\n"
 "vmovupd %%zmm8, (%[C7])\n"
- "prefetchw 64(%[C0])\n"
- "prefetchw 64(%[C1])\n"
- "prefetchw 64(%[C2])\n"
- "prefetchw 64(%[C3])\n"
- "prefetchw 64(%[C4])\n"
- "prefetchw 64(%[C5])\n"
- "prefetchw 64(%[C6])\n"
- "prefetchw 64(%[C7])\n"
 : [AO] "+r" (AO), [BO] "+r" (BO),
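
Note (not part of the patch): the change above folds the separate alpha-multiply and
C-add of the store epilogue into one fused multiply-add. In AT&T syntax,
"vfmadd213pd (%[Cn]), %%zmm9, %%zmmN" computes zmmN = zmm9 * zmmN + mem, i.e.
alpha * accumulator + C. A minimal intrinsics-level sketch of the same idea, assuming
AVX-512F and using hypothetical names (acc, alpha, c) that do not appear in the kernel:

    #include <immintrin.h>

    /* Old epilogue: scale the accumulator, then add the C tile (two instructions). */
    static inline void store_mul_add(__m512d acc, __m512d alpha, double *c)
    {
        acc = _mm512_mul_pd(alpha, acc);              /* vmulpd */
        acc = _mm512_add_pd(_mm512_loadu_pd(c), acc); /* vaddpd */
        _mm512_storeu_pd(c, acc);
    }

    /* New epilogue: one fused multiply-add computes alpha * acc + C (vfmadd213pd). */
    static inline void store_fma(__m512d acc, __m512d alpha, double *c)
    {
        acc = _mm512_fmadd_pd(alpha, acc, _mm512_loadu_pd(c));
        _mm512_storeu_pd(c, acc);
    }

The fused form saves one instruction per store and keeps the memory operand on the
FMA, which is why the gain is small but still worth having on general principles.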