dgemm/skylakex: replace discrete mul/add with fma

Very minor gains, since it's not super hot code, but it follows general principles.
Arjan van de Ven 2018-10-06 23:13:26 +00:00
parent adbf6afa25
commit 582c589727
1 changed file with 49 additions and 106 deletions
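
The change is mechanical: the epilogue used to scale each accumulator by alpha with a vmulpd and then fold in the corresponding block of C with a vaddpd; both steps now collapse into a single vfmadd213pd with a memory operand. The same substitution is applied to all three kernel tails touched below, and the final hunk also drops a run of prefetchw hints. A minimal C sketch of the transformation, using AVX-512 intrinsics with invented names (the actual kernel is hand-written inline assembly, not intrinsics):

#include <immintrin.h>

/* Illustrative sketch only; function and variable names are made up. */

/* Before: two instructions per accumulator (vmulpd + vaddpd). */
static inline __m512d epilogue_old(__m512d acc, __m512d valpha, const double *c)
{
    acc = _mm512_mul_pd(valpha, acc);              /* acc *= alpha   */
    return _mm512_add_pd(_mm512_loadu_pd(c), acc); /* acc += C[0..7] */
}

/* After: one fused multiply-add per accumulator (vfmadd213pd). */
static inline __m512d epilogue_new(__m512d acc, __m512d valpha, const double *c)
{
    return _mm512_fmadd_pd(valpha, acc, _mm512_loadu_pd(c)); /* alpha*acc + C */
}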
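A note on the instruction semantics (an addition, not part of the original commit message): in the AT&T operand order used below, vfmadd213pd (%[C0]), %%zmm9, %%zmm1 computes zmm1 = zmm9 * zmm1 + mem, i.e. alpha * accumulator + C, which is exactly the work of the removed vmulpd/vaddpd pair; the "213" suffix encodes which operands feed the multiply and which one is added. One behavioural difference: the fused form rounds once instead of twice, so results can differ from mul-then-add in the last ulp. A scalar model of one vector lane, again with illustrative names:

#include <math.h>

/* One lane of the epilogue; names are invented for illustration. */
double lane_old(double acc, double alpha, double c)
{
    return (alpha * acc) + c;  /* two roundings: multiply, then add */
}

double lane_new(double acc, double alpha, double c)
{
    return fma(alpha, acc, c); /* single rounding via C99 fma()     */
}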


@@ -927,39 +927,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "jg .label24\n"
 /* multiply the result by alpha */
 "vbroadcastsd (%[alpha]), %%zmm9\n"
-"vmulpd %%zmm9, %%zmm1, %%zmm1\n"
-"vmulpd %%zmm9, %%zmm2, %%zmm2\n"
-"vmulpd %%zmm9, %%zmm3, %%zmm3\n"
-"vmulpd %%zmm9, %%zmm4, %%zmm4\n"
-"vmulpd %%zmm9, %%zmm5, %%zmm5\n"
-"vmulpd %%zmm9, %%zmm6, %%zmm6\n"
-"vmulpd %%zmm9, %%zmm7, %%zmm7\n"
-"vmulpd %%zmm9, %%zmm8, %%zmm8\n"
-"vmulpd %%zmm9, %%zmm11, %%zmm11\n"
-"vmulpd %%zmm9, %%zmm12, %%zmm12\n"
-"vmulpd %%zmm9, %%zmm13, %%zmm13\n"
-"vmulpd %%zmm9, %%zmm14, %%zmm14\n"
-"vmulpd %%zmm9, %%zmm15, %%zmm15\n"
-"vmulpd %%zmm9, %%zmm16, %%zmm16\n"
-"vmulpd %%zmm9, %%zmm17, %%zmm17\n"
-"vmulpd %%zmm9, %%zmm18, %%zmm18\n"
-"vmulpd %%zmm9, %%zmm21, %%zmm21\n"
-"vmulpd %%zmm9, %%zmm22, %%zmm22\n"
-"vmulpd %%zmm9, %%zmm23, %%zmm23\n"
-"vmulpd %%zmm9, %%zmm24, %%zmm24\n"
-"vmulpd %%zmm9, %%zmm25, %%zmm25\n"
-"vmulpd %%zmm9, %%zmm26, %%zmm26\n"
-"vmulpd %%zmm9, %%zmm27, %%zmm27\n"
-"vmulpd %%zmm9, %%zmm28, %%zmm28\n"
 /* And store additively in C */
-"vaddpd (%[C0]), %%zmm1, %%zmm1\n"
-"vaddpd (%[C1]), %%zmm2, %%zmm2\n"
-"vaddpd (%[C2]), %%zmm3, %%zmm3\n"
-"vaddpd (%[C3]), %%zmm4, %%zmm4\n"
-"vaddpd (%[C4]), %%zmm5, %%zmm5\n"
-"vaddpd (%[C5]), %%zmm6, %%zmm6\n"
-"vaddpd (%[C6]), %%zmm7, %%zmm7\n"
-"vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+"vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n"
+"vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n"
+"vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n"
+"vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n"
+"vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n"
+"vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n"
+"vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n"
+"vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n"
 "vmovupd %%zmm1, (%[C0])\n"
 "vmovupd %%zmm2, (%[C1])\n"
 "vmovupd %%zmm3, (%[C2])\n"
@@ -969,14 +945,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "vmovupd %%zmm7, (%[C6])\n"
 "vmovupd %%zmm8, (%[C7])\n"
-"vaddpd 64(%[C0]), %%zmm11, %%zmm11\n"
-"vaddpd 64(%[C1]), %%zmm12, %%zmm12\n"
-"vaddpd 64(%[C2]), %%zmm13, %%zmm13\n"
-"vaddpd 64(%[C3]), %%zmm14, %%zmm14\n"
-"vaddpd 64(%[C4]), %%zmm15, %%zmm15\n"
-"vaddpd 64(%[C5]), %%zmm16, %%zmm16\n"
-"vaddpd 64(%[C6]), %%zmm17, %%zmm17\n"
-"vaddpd 64(%[C7]), %%zmm18, %%zmm18\n"
+"vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n"
+"vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n"
+"vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n"
+"vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n"
+"vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n"
+"vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n"
+"vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n"
+"vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n"
 "vmovupd %%zmm11, 64(%[C0])\n"
 "vmovupd %%zmm12, 64(%[C1])\n"
 "vmovupd %%zmm13, 64(%[C2])\n"
@@ -986,14 +962,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "vmovupd %%zmm17, 64(%[C6])\n"
 "vmovupd %%zmm18, 64(%[C7])\n"
-"vaddpd 128(%[C0]), %%zmm21, %%zmm21\n"
-"vaddpd 128(%[C1]), %%zmm22, %%zmm22\n"
-"vaddpd 128(%[C2]), %%zmm23, %%zmm23\n"
-"vaddpd 128(%[C3]), %%zmm24, %%zmm24\n"
-"vaddpd 128(%[C4]), %%zmm25, %%zmm25\n"
-"vaddpd 128(%[C5]), %%zmm26, %%zmm26\n"
-"vaddpd 128(%[C6]), %%zmm27, %%zmm27\n"
-"vaddpd 128(%[C7]), %%zmm28, %%zmm28\n"
+"vfmadd213pd 128(%[C0]), %%zmm9, %%zmm21\n"
+"vfmadd213pd 128(%[C1]), %%zmm9, %%zmm22\n"
+"vfmadd213pd 128(%[C2]), %%zmm9, %%zmm23\n"
+"vfmadd213pd 128(%[C3]), %%zmm9, %%zmm24\n"
+"vfmadd213pd 128(%[C4]), %%zmm9, %%zmm25\n"
+"vfmadd213pd 128(%[C5]), %%zmm9, %%zmm26\n"
+"vfmadd213pd 128(%[C6]), %%zmm9, %%zmm27\n"
+"vfmadd213pd 128(%[C7]), %%zmm9, %%zmm28\n"
 "vmovupd %%zmm21, 128(%[C0])\n"
 "vmovupd %%zmm22, 128(%[C1])\n"
 "vmovupd %%zmm23, 128(%[C2])\n"
@@ -1108,31 +1084,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "jg .label16\n"
 /* multiply the result by alpha */
 "vbroadcastsd (%[alpha]), %%zmm9\n"
-"vmulpd %%zmm9, %%zmm1, %%zmm1\n"
-"vmulpd %%zmm9, %%zmm2, %%zmm2\n"
-"vmulpd %%zmm9, %%zmm3, %%zmm3\n"
-"vmulpd %%zmm9, %%zmm4, %%zmm4\n"
-"vmulpd %%zmm9, %%zmm5, %%zmm5\n"
-"vmulpd %%zmm9, %%zmm6, %%zmm6\n"
-"vmulpd %%zmm9, %%zmm7, %%zmm7\n"
-"vmulpd %%zmm9, %%zmm8, %%zmm8\n"
-"vmulpd %%zmm9, %%zmm11, %%zmm11\n"
-"vmulpd %%zmm9, %%zmm12, %%zmm12\n"
-"vmulpd %%zmm9, %%zmm13, %%zmm13\n"
-"vmulpd %%zmm9, %%zmm14, %%zmm14\n"
-"vmulpd %%zmm9, %%zmm15, %%zmm15\n"
-"vmulpd %%zmm9, %%zmm16, %%zmm16\n"
-"vmulpd %%zmm9, %%zmm17, %%zmm17\n"
-"vmulpd %%zmm9, %%zmm18, %%zmm18\n"
 /* And store additively in C */
-"vaddpd (%[C0]), %%zmm1, %%zmm1\n"
-"vaddpd (%[C1]), %%zmm2, %%zmm2\n"
-"vaddpd (%[C2]), %%zmm3, %%zmm3\n"
-"vaddpd (%[C3]), %%zmm4, %%zmm4\n"
-"vaddpd (%[C4]), %%zmm5, %%zmm5\n"
-"vaddpd (%[C5]), %%zmm6, %%zmm6\n"
-"vaddpd (%[C6]), %%zmm7, %%zmm7\n"
-"vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+"vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n"
+"vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n"
+"vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n"
+"vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n"
+"vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n"
+"vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n"
+"vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n"
+"vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n"
 "vmovupd %%zmm1, (%[C0])\n"
 "vmovupd %%zmm2, (%[C1])\n"
 "vmovupd %%zmm3, (%[C2])\n"
@@ -1142,14 +1102,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "vmovupd %%zmm7, (%[C6])\n"
 "vmovupd %%zmm8, (%[C7])\n"
-"vaddpd 64(%[C0]), %%zmm11, %%zmm11\n"
-"vaddpd 64(%[C1]), %%zmm12, %%zmm12\n"
-"vaddpd 64(%[C2]), %%zmm13, %%zmm13\n"
-"vaddpd 64(%[C3]), %%zmm14, %%zmm14\n"
-"vaddpd 64(%[C4]), %%zmm15, %%zmm15\n"
-"vaddpd 64(%[C5]), %%zmm16, %%zmm16\n"
-"vaddpd 64(%[C6]), %%zmm17, %%zmm17\n"
-"vaddpd 64(%[C7]), %%zmm18, %%zmm18\n"
+"vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n"
+"vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n"
+"vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n"
+"vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n"
+"vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n"
+"vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n"
+"vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n"
+"vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n"
 "vmovupd %%zmm11, 64(%[C0])\n"
 "vmovupd %%zmm12, 64(%[C1])\n"
 "vmovupd %%zmm13, 64(%[C2])\n"
@@ -1221,24 +1181,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "add $64, %[BO]\n"
 "subl $1, %[kloop]\n"
 "jg .label1\n"
-/* multiply the result by alpha */
-"vmulpd %%zmm9, %%zmm1, %%zmm1\n"
-"vmulpd %%zmm9, %%zmm2, %%zmm2\n"
-"vmulpd %%zmm9, %%zmm3, %%zmm3\n"
-"vmulpd %%zmm9, %%zmm4, %%zmm4\n"
-"vmulpd %%zmm9, %%zmm5, %%zmm5\n"
-"vmulpd %%zmm9, %%zmm6, %%zmm6\n"
-"vmulpd %%zmm9, %%zmm7, %%zmm7\n"
-"vmulpd %%zmm9, %%zmm8, %%zmm8\n"
-/* And store additively in C */
-"vaddpd (%[C0]), %%zmm1, %%zmm1\n"
-"vaddpd (%[C1]), %%zmm2, %%zmm2\n"
-"vaddpd (%[C2]), %%zmm3, %%zmm3\n"
-"vaddpd (%[C3]), %%zmm4, %%zmm4\n"
-"vaddpd (%[C4]), %%zmm5, %%zmm5\n"
-"vaddpd (%[C5]), %%zmm6, %%zmm6\n"
-"vaddpd (%[C6]), %%zmm7, %%zmm7\n"
-"vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+/* multiply the result by alpha and add to the memory */
+"vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n"
+"vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n"
+"vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n"
+"vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n"
+"vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n"
+"vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n"
+"vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n"
+"vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n"
 "vmovupd %%zmm1, (%[C0])\n"
 "vmovupd %%zmm2, (%[C1])\n"
 "vmovupd %%zmm3, (%[C2])\n"
@@ -1247,14 +1198,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 "vmovupd %%zmm6, (%[C5])\n"
 "vmovupd %%zmm7, (%[C6])\n"
 "vmovupd %%zmm8, (%[C7])\n"
-"prefetchw 64(%[C0])\n"
-"prefetchw 64(%[C1])\n"
-"prefetchw 64(%[C2])\n"
-"prefetchw 64(%[C3])\n"
-"prefetchw 64(%[C4])\n"
-"prefetchw 64(%[C5])\n"
-"prefetchw 64(%[C6])\n"
-"prefetchw 64(%[C7])\n"
 :
 [AO] "+r" (AO),
 [BO] "+r" (BO),