From 8019e70211f5e6679e1e5afd5658016d8045de19 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 3 Feb 2020 21:32:56 +0800 Subject: [PATCH 01/12] AVX512 16x2 DGEMM kernel --- kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 488 +++++++++++++++++++++ 1 file changed, 488 insertions(+) create mode 100644 kernel/x86_64/dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c new file mode 100644 index 000000000..250ff8d49 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -0,0 +1,488 @@ +#if (defined (LEFT) && !defined(TRANSA)) || (!defined (LEFT) && defined(TRANSA)) + #define BACKWARDS 1 +#else + #define BACKWARDS 0 +#endif +#define GEMM_SET_PB "movq %%r14,%1; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;" +#define set_p_copy1(ptr) "sarq $1,%%r12; addq %%r12,"#ptr"; salq $1,%%r12; salq $3,%%r13; subq %%r13,"#ptr"; sarq $3,%%r13;" +#define set_p_copy2(ptr) "addq %%r12,"#ptr"; salq $4,%%r13; subq %%r13,"#ptr"; sarq $4,%%r13;" +#define set_p_copy4(ptr) "leaq ("#ptr",%%r12,2),"#ptr"; salq $5,%%r13; subq %%r13,"#ptr"; sarq $5,%%r13;" +#define set_p_copy8(ptr) "leaq ("#ptr",%%r12,4),"#ptr"; salq $6,%%r13; subq %%r13,"#ptr"; sarq $6,%%r13;" +#define set_p_copy16(ptr) "leaq ("#ptr",%%r12,8),"#ptr"; salq $7,%%r13; subq %%r13,"#ptr"; sarq $7,%%r13;" +#define set_p_b_dim1(ptr) set_p_copy1(ptr) +#define set_p_b_dim2(ptr) set_p_copy2(ptr) +#define set_p_b_dim4(ptr) set_p_copy2(ptr) +#define set_p_b_dim6(ptr) set_p_copy2(ptr) +#define set_p_b_dim8(ptr) set_p_copy2(ptr) +#define set_p_b_dim10(ptr) set_p_copy2(ptr) +#define set_p_b_dim12(ptr) set_p_copy2(ptr) +#ifdef TRMMKERNEL + #if BACKWARDS == 1 + #define INIT_set_papb(mdim,ndim) GEMM_SET_PB set_p_copy##mdim(%0) set_p_b_dim##ndim(%1) set_p_b_dim##ndim(%%r15) + #define SAVE_set_pa(mdim) "" + #else + #define INIT_set_papb(mdim,ndim) GEMM_SET_PB + #define SAVE_set_pa(mdim) set_p_copy##mdim(%0) + #endif +#else + #define INIT_set_papb(mdim,ndim) GEMM_SET_PB + #define SAVE_set_pa(mdim) "" +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + #if BACKWARDS == 1 + #define HEAD_SET_OFF(ndim) {} + #define TAIL_SET_OFF(ndim) {off += ndim;} + #define kernel_kstart_n4(mdim,updk) KERNEL_k1m##mdim##n2 KERNEL_k1m##mdim##n2 "addq $32,%%r15; "#updk" $2,%5;" + #define kernel_kstart_n6(mdim,updk) kernel_kstart_n4(mdim,updk) KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "addq $32,%%r15; "#updk" $2,%5;" + #define kernel_kstart_n8(mdim,updk) kernel_kstart_n6(mdim,updk) KERNEL_k1m##mdim##n6 KERNEL_k1m##mdim##n6 "addq $32,%%r15; "#updk" $2,%5;" + #define kernel_kstart_n10(mdim,updk) kernel_kstart_n8(mdim,updk) KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 #updk" $2,%5;" + #define kernel_kstart_n12(mdim,updk) kernel_kstart_n10(mdim,updk) KERNEL_k1m##mdim##n10 KERNEL_k1m##mdim##n10 #updk" $2,%5;" + #define kernel_kend_n4(mdim) "" + #define kernel_kend_n6(mdim) "" + #define kernel_kend_n8(mdim) "" + #define kernel_kend_n10(mdim) "" + #define kernel_kend_n12(mdim) "" + #else + #define HEAD_SET_OFF(ndim) {off += (ndim > 2 ? 2 : ndim);} + #define TAIL_SET_OFF(ndim) {off += (ndim > 2 ? (ndim-2) : 0);} + #define kernel_kstart_n4(mdim,updk) "" + #define kernel_kstart_n6(mdim,updk) "" + #define kernel_kstart_n8(mdim,updk) "" + #define kernel_kstart_n10(mdim,updk) "" + #define kernel_kstart_n12(mdim,updk) "" + #define kernel_kend_n4(mdim) "xorq %3,%3;"\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) + #define kernel_kend_n6(mdim) "xorq %3,%3;"\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) + #define kernel_kend_n8(mdim) "xorq %3,%3;"\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) + #define kernel_kend_n10(mdim) "xorq %3,%3;"\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) + #define kernel_kend_n12(mdim) "xorq %3,%3;"\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8) acc_kend_nc6_k1m##mdim(0,8)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24) acc_kend_nc6_k1m##mdim(16,24)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40) acc_kend_nc6_k1m##mdim(32,40)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56) acc_kend_nc6_k1m##mdim(48,56)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72) acc_kend_nc6_k1m##mdim(64,72)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88) acc_kend_nc6_k1m##mdim(80,88)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104) acc_kend_nc6_k1m##mdim(96,104)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) acc_kend_nc6_k1m##mdim(112,120)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128,136)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144,152) + #endif +#else + #define HEAD_SET_OFF(ndim) {} + #define TAIL_SET_OFF(ndim) {} + #define kernel_kstart_n4(mdim,updk) "" + #define kernel_kstart_n6(mdim,updk) "" + #define kernel_kstart_n8(mdim,updk) "" + #define kernel_kstart_n10(mdim,updk) "" + #define kernel_kstart_n12(mdim,updk) "" + #define kernel_kend_n4(mdim) "" + #define kernel_kend_n6(mdim) "" + #define kernel_kend_n8(mdim) "" + #define kernel_kend_n10(mdim) "" + #define kernel_kend_n12(mdim) "" +#endif +#define kernel_kstart_n1(mdim,updk) "" +#define kernel_kstart_n2(mdim,updk) "" +#define kernel_kend_n1(mdim) "" +#define kernel_kend_n2(mdim) "" + +#ifdef TRMMKERNEL + #if BACKWARDS == 1 + #define INITASM_SET_K "movq %10,%%r13; subq %9,%%r13;" + #else + #define INITASM_SET_K "movq %9,%%r13;" + #endif +#else + #define INITASM_SET_K "movq %10,%%r13;" +#endif +#if defined(TRMMKERNEL) && defined(LEFT) + #if BACKWARDS==1 + #define init_update_k(mdim) "" + #define save_update_k(mdim) "subq $"#mdim",%%r13;" + #else + #define init_update_k(mdim) "addq $"#mdim",%%r13;" + #define save_update_k(mdim) "" + #endif +#else + #define init_update_k(mdim) "" + #define save_update_k(mdim) "" +#endif + +#define KERNEL_h_k1m16n1 \ + "vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\ + "vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;" +#define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;" +#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ + "vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;" +#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" +#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,boff2,...)\ + "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\ + "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" +#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,8,__VA_ARGS__) +#define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) +#define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;" +#define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m16n8 KERNEL_k1m16n6 "prefetcht0 448(%0);" unit_acc_m16n2(20,21,22,23,%%r15) +#define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%%r15;" +#define KERNEL_h_k1m16n10 KERNEL_h_k1m16n8 unit_acc_m16n2(24,25,26,27,%%r15,%%r12,1) +#define KERNEL_k1m16n10 KERNEL_h_k1m16n10 "addq $16,%%r15;" +#define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2) +#define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;" +#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;" + #define acc_kend_nc2_k1m16(boff1,boff2) unit_acc_gen_m16n2(12,13,14,15,boff1,boff2,%1,%%r12,1) + #define acc_kend_nc3_k1m16(boff1,boff2) unit_acc_gen_m16n2(16,17,18,19,boff1,boff2,%1,%%r12,2) + #define acc_kend_nc4_k1m16(boff1,boff2) unit_acc_gen_m16n2(20,21,22,23,boff1,boff2,%%r15) + #define acc_kend_nc5_k1m16(boff1,boff2) unit_acc_gen_m16n2(24,25,26,27,boff1,boff2,%%r15,%%r12,1) + #define acc_kend_nc6_k1m16(boff1,boff2) unit_acc_gen_m16n2(28,29,30,31,boff1,boff2,%%r15,%%r12,2) +#endif +#define save_init_m16 "movq %2,%3; addq $128,%2;" +#ifdef TRMMKERNEL + #define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ + "vmulpd %%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ + "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11) +#define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15) +#define SAVE_m16n6 SAVE_m16n4 unit_save_m16n2(16,17,18,19) +#define SAVE_m16n8 SAVE_m16n6 unit_save_m16n2(20,21,22,23) +#define SAVE_m16n10 SAVE_m16n8 unit_save_m16n2(24,25,26,27) +#define SAVE_m16n12 SAVE_m16n10 unit_save_m16n2(28,29,30,31) +#define unit_init_2zmm(c1_no,c2_no) "vpxorq %%zmm"#c1_no",%%zmm"#c1_no",%%zmm"#c1_no"; vpxorq %%zmm"#c2_no",%%zmm"#c2_no",%%zmm"#c2_no";" +#define unit_init_4zmm(c1_no,c2_no,c3_no,c4_no) unit_init_2zmm(c1_no,c2_no) unit_init_2zmm(c3_no,c4_no) +#define INIT_m16n1 unit_init_2zmm(8,9) +#define INIT_m16n2 unit_init_4zmm(8,9,10,11) +#define INIT_m16n4 INIT_m16n2 unit_init_4zmm(12,13,14,15) +#define INIT_m16n6 INIT_m16n4 unit_init_4zmm(16,17,18,19) +#define INIT_m16n8 INIT_m16n6 unit_init_4zmm(20,21,22,23) +#define INIT_m16n10 INIT_m16n8 unit_init_4zmm(24,25,26,27) +#define INIT_m16n12 INIT_m16n10 unit_init_4zmm(28,29,30,31) + +#define KERNEL_k1m8n1 \ + "vbroadcastsd (%1),%%zmm1; addq $8,%1;"\ + "vfmadd231pd (%0),%%zmm1,%%zmm8; addq $64,%0;" +#define unit_acc_gen_m8n2(c1_no,c2_no,boff,...)\ + "vbroadcastf32x4 "#boff"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" +#define unit_acc_m8n2(c1_no,c2_no,...) unit_acc_gen_m8n2(c1_no,c2_no,0,__VA_ARGS__) +#define KERNEL_h_k1m8n2 \ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" unit_acc_m8n2(8,9,%1) +#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $16,%1;" +#define KERNEL_h_k1m8n4 KERNEL_h_k1m8n2 unit_acc_m8n2(10,11,%1,%%r12,1) +#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define KERNEL_k1m8n6 KERNEL_h_k1m8n4 unit_acc_m8n2(12,13,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m8n8 KERNEL_k1m8n6 unit_acc_m8n2(14,15,%%r15) +#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%%r15;" +#define KERNEL_h_k1m8n10 KERNEL_h_k1m8n8 unit_acc_m8n2(16,17,%%r15,%%r12,1) +#define KERNEL_k1m8n10 KERNEL_h_k1m8n10 "addq $16,%%r15;" +#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n10 unit_acc_m8n2(18,19,%%r15,%%r12,2) +#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;" +#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;" + #define acc_kend_nc2_k1m8(boff1,boff2) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m8(boff1,boff2) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m8(boff1,boff2) unit_acc_gen_m8n2(14,15,boff1,%%r15) + #define acc_kend_nc5_k1m8(boff1,boff2) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m8(boff1,boff2) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) +#endif +#define save_init_m8 "movq %2,%3; addq $64,%2;" +#ifdef TRMMKERNEL + #define SAVE_m8n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); addq $64,%2;" + #define unit_save_m8n2(c1_no,c2_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3);"\ + "vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm2; vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define SAVE_m8n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); addq $64,%2;" + #define unit_save_m8n2(c1_no,c2_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3);"\ + "vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm2; vfmadd213pd (%3,%4,1),%%zmm0,%%zmm2; vmovupd %%zmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_m8n2 save_init_m8 unit_save_m8n2(8,9) +#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(10,11) +#define SAVE_m8n6 SAVE_m8n4 unit_save_m8n2(12,13) +#define SAVE_m8n8 SAVE_m8n6 unit_save_m8n2(14,15) +#define SAVE_m8n10 SAVE_m8n8 unit_save_m8n2(16,17) +#define SAVE_m8n12 SAVE_m8n10 unit_save_m8n2(18,19) +#define INIT_m8n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;" +#define INIT_m8n2 unit_init_2zmm(8,9) +#define INIT_m8n4 INIT_m8n2 unit_init_2zmm(10,11) +#define INIT_m8n6 INIT_m8n4 unit_init_2zmm(12,13) +#define INIT_m8n8 INIT_m8n6 unit_init_2zmm(14,15) +#define INIT_m8n10 INIT_m8n8 unit_init_2zmm(16,17) +#define INIT_m8n12 INIT_m8n10 unit_init_2zmm(18,19) + +#define KERNEL_k1m4n1 \ + "vbroadcastsd (%1),%%ymm1; addq $8,%1;"\ + "vfmadd231pd (%0),%%ymm1,%%ymm4; addq $32,%0;" +#define unit_acc_gen_m4n2(c1_no,c2_no,boff,...)\ + "vbroadcastf128 "#boff"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm"#c1_no"; vfmadd231pd %%ymm2,%%ymm3,%%ymm"#c2_no";" +#define unit_acc_m4n2(c1_no,c2_no,...) unit_acc_gen_m4n2(c1_no,c2_no,0,__VA_ARGS__) +#define KERNEL_h_k1m4n2 \ + "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;" unit_acc_m4n2(4,5,%1) +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;" +#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 unit_acc_m4n2(6,7,%1,%%r12,1) +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define KERNEL_k1m4n6 KERNEL_h_k1m4n4 unit_acc_m4n2(8,9,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m4n8 KERNEL_k1m4n6 unit_acc_m4n2(10,11,%%r15) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%%r15;" +#define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_acc_m4n2(12,13,%%r15,%%r12,1) +#define KERNEL_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_acc_m4n2(14,15,%%r15,%%r12,2) +#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" + #define acc_kend_nc2_k1m4(boff1,boff2) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m4(boff1,boff2) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m4(boff1,boff2) unit_acc_gen_m4n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m4(boff1,boff2) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m4(boff1,boff2) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) +#endif +#define save_init_m4 "movq %2,%3; addq $32,%2;" +#ifdef TRMMKERNEL + #define SAVE_m4n1 "vmulpd %%ymm4,%%ymm0,%%ymm4; vmovupd %%ymm4,(%2); addq $32,%2;" + #define unit_save_m4n2(c1_no,c2_no)\ + "vunpcklpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm1; vmulpd %%ymm1,%%ymm0,%%ymm1; vmovupd %%ymm1,(%3);"\ + "vunpckhpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm2; vmulpd %%ymm2,%%ymm0,%%ymm2; vmovupd %%ymm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define SAVE_m4n1 "vfmadd213pd (%2),%%ymm0,%%ymm4; vmovupd %%ymm4,(%2); addq $32,%2;" + #define unit_save_m4n2(c1_no,c2_no)\ + "vunpcklpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm1; vfmadd213pd (%3),%%ymm0,%%ymm1; vmovupd %%ymm1,(%3);"\ + "vunpckhpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm2; vfmadd213pd (%3,%4,1),%%ymm0,%%ymm2; vmovupd %%ymm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_m4n2 save_init_m4 unit_save_m4n2(4,5) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(6,7) +#define SAVE_m4n6 SAVE_m4n4 unit_save_m4n2(8,9) +#define SAVE_m4n8 SAVE_m4n6 unit_save_m4n2(10,11) +#define SAVE_m4n10 SAVE_m4n8 unit_save_m4n2(12,13) +#define SAVE_m4n12 SAVE_m4n10 unit_save_m4n2(14,15) +#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define unit_init_2ymm(c1_no,c2_no) "vpxor %%ymm"#c1_no",%%ymm"#c1_no",%%ymm"#c1_no"; vpxor %%ymm"#c2_no",%%ymm"#c2_no",%%ymm"#c2_no";" +#define INIT_m4n2 unit_init_2ymm(4,5) +#define INIT_m4n4 INIT_m4n2 unit_init_2ymm(6,7) +#define INIT_m4n6 INIT_m4n4 unit_init_2ymm(8,9) +#define INIT_m4n8 INIT_m4n6 unit_init_2ymm(10,11) +#define INIT_m4n10 INIT_m4n8 unit_init_2ymm(12,13) +#define INIT_m4n12 INIT_m4n10 unit_init_2ymm(14,15) + +#define KERNEL_k1m2n1 \ + "vmovddup (%1),%%xmm1; addq $8,%1;"\ + "vfmadd231pd (%0),%%xmm1,%%xmm4; addq $16,%0;" +#define unit_acc_gen_m2n2(c1_no,c2_no,boff,...)\ + "vmovupd "#boff"("#__VA_ARGS__"),%%xmm3; vfmadd231pd %%xmm1,%%xmm3,%%xmm"#c1_no"; vfmadd231pd %%xmm2,%%xmm3,%%xmm"#c2_no";" +#define unit_acc_m2n2(c1_no,c2_no,...) unit_acc_gen_m2n2(c1_no,c2_no,0,__VA_ARGS__) +#define KERNEL_h_k1m2n2 \ + "vmovddup (%0),%%xmm1; vmovddup 8(%0),%%xmm2; addq $16,%0;" unit_acc_m2n2(4,5,%1) +#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;" +#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 unit_acc_m2n2(6,7,%1,%%r12,1) +#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;" +#define KERNEL_k1m2n6 KERNEL_h_k1m2n4 unit_acc_m2n2(8,9,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m2n8 KERNEL_k1m2n6 unit_acc_m2n2(10,11,%%r15) +#define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%%r15;" +#define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_acc_m2n2(12,13,%%r15,%%r12,1) +#define KERNEL_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;" +#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_acc_m2n2(14,15,%%r15,%%r12,2) +#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" +#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" + #define acc_kend_nc2_k1m2(boff1,boff2) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m2(boff1,boff2) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m2(boff1,boff2) unit_acc_gen_m2n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m2(boff1,boff2) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m2(boff1,boff2) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) +#endif +#define save_init_m2 "movq %2,%3; addq $16,%2;" +#ifdef TRMMKERNEL + #define SAVE_m2n1 "vmulpd %%xmm4,%%xmm0,%%xmm4; vmovupd %%xmm4,(%2); addq $16,%2;" + #define unit_save_m2n2(c1_no,c2_no)\ + "vunpcklpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm1; vmulpd %%xmm1,%%xmm0,%%xmm1; vmovupd %%xmm1,(%3);"\ + "vunpckhpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm2; vmulpd %%xmm2,%%xmm0,%%xmm2; vmovupd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define SAVE_m2n1 "vfmadd213pd (%2),%%xmm0,%%xmm4; vmovupd %%xmm4,(%2); addq $16,%2;" + #define unit_save_m2n2(c1_no,c2_no)\ + "vunpcklpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm1; vfmadd213pd (%3),%%xmm0,%%xmm1; vmovupd %%xmm1,(%3);"\ + "vunpckhpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm2; vfmadd213pd (%3,%4,1),%%xmm0,%%xmm2; vmovupd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_m2n2 save_init_m2 unit_save_m2n2(4,5) +#define SAVE_m2n4 SAVE_m2n2 unit_save_m2n2(6,7) +#define SAVE_m2n6 SAVE_m2n4 unit_save_m2n2(8,9) +#define SAVE_m2n8 SAVE_m2n6 unit_save_m2n2(10,11) +#define SAVE_m2n10 SAVE_m2n8 unit_save_m2n2(12,13) +#define SAVE_m2n12 SAVE_m2n10 unit_save_m2n2(14,15) +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define unit_init_2xmm(c1_no,c2_no) "vpxor %%xmm"#c1_no",%%xmm"#c1_no",%%xmm"#c1_no"; vpxor %%xmm"#c2_no",%%xmm"#c2_no",%%xmm"#c2_no";" +#define INIT_m2n2 unit_init_2xmm(4,5) +#define INIT_m2n4 INIT_m2n2 unit_init_2xmm(6,7) +#define INIT_m2n6 INIT_m2n4 unit_init_2xmm(8,9) +#define INIT_m2n8 INIT_m2n6 unit_init_2xmm(10,11) +#define INIT_m2n10 INIT_m2n8 unit_init_2xmm(12,13) +#define INIT_m2n12 INIT_m2n10 unit_init_2xmm(14,15) + +#define KERNEL_k1m1n1 \ + "vmovsd (%1),%%xmm1; addq $8,%1;"\ + "vfmadd231sd (%0),%%xmm1,%%xmm4; addq $8,%0;" +#define KERNEL_h_k1m1n2 \ + "vmovddup (%0),%%xmm1; addq $8,%0;"\ + "vfmadd231pd (%1),%%xmm1,%%xmm4;" +#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;" +#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vfmadd231pd (%1,%%r12,1),%%xmm1,%%xmm5;" +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;" +#define KERNEL_k1m1n6 KERNEL_h_k1m1n4 "vfmadd231pd (%1,%%r12,2),%%xmm1,%%xmm6; addq $16,%1;" +#define KERNEL_h_k1m1n8 KERNEL_k1m1n6 "vfmadd231pd (%%r15),%%xmm1,%%xmm7;" +#define KERNEL_k1m1n8 KERNEL_h_k1m1n8 "addq $16,%%r15;" +#define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 "vfmadd231pd (%%r15,%%r12,1),%%xmm1,%%xmm8;" +#define KERNEL_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;" +#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 "vfmadd231pd (%%r15,%%r12,2),%%xmm1,%%xmm9;" +#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" + #define acc_kend_nc2_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" + #define acc_kend_nc3_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" + #define acc_kend_nc4_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" + #define acc_kend_nc5_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" + #define acc_kend_nc6_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" +#endif +#define save_init_m1 "movq %2,%3; addq $8,%2;" +#ifdef TRMMKERNEL + #define SAVE_m1n1 "vmulsd %%xmm4,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2); addq $8,%2;" + #define unit_save_m1n2(c1_no)\ + "vmulpd %%xmm"#c1_no",%%xmm0,%%xmm2; vmovsd %%xmm2,(%3); vmovhpd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define SAVE_m1n1 "vfmadd213sd (%2),%%xmm0,%%xmm4; vmovsd %%xmm4,(%2); addq $8,%2;" + #define unit_save_m1n2(c1_no)\ + "vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vfmadd231pd %%xmm"#c1_no",%%xmm0,%%xmm2; vmovsd %%xmm2,(%3); vmovhpd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_m1n2 save_init_m1 unit_save_m1n2(4) +#define SAVE_m1n4 SAVE_m1n2 unit_save_m1n2(5) +#define SAVE_m1n6 SAVE_m1n4 unit_save_m1n2(6) +#define SAVE_m1n8 SAVE_m1n6 unit_save_m1n2(7) +#define SAVE_m1n10 SAVE_m1n8 unit_save_m1n2(8) +#define SAVE_m1n12 SAVE_m1n10 unit_save_m1n2(9) +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m1n2 INIT_m1n1 +#define INIT_m1n4 INIT_m1n2 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n6 INIT_m1n4 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define INIT_m1n8 INIT_m1n6 "vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m1n10 INIT_m1n8 "vpxor %%xmm8,%%xmm8,%%xmm8;" +#define INIT_m1n12 INIT_m1n10 "vpxor %%xmm9,%%xmm9,%%xmm9;" + +#define COMPUTE_SIMPLE(mdim,ndim)\ + init_update_k(mdim) INIT_m##mdim##n##ndim "testq %%r13,%%r13; jz 7"#mdim"7"#ndim"9f;"\ + "movq %%r13,%5;" INIT_set_papb(mdim,ndim)\ + kernel_kstart_n##ndim(mdim,subq)\ + "7"#mdim"7"#ndim"1:\n\t"\ + KERNEL_k1m##mdim##n##ndim "decq %5; jnz 7"#mdim"7"#ndim"1b;"\ + "7"#mdim"7"#ndim"9:\n\t"\ + kernel_kend_n##ndim(mdim)\ + SAVE_set_pa(mdim) SAVE_m##mdim##n##ndim save_update_k(mdim) +#define COMPUTE_m16n1 COMPUTE_SIMPLE(16,1) +#define COMPUTE_m16n2 COMPUTE_SIMPLE(16,2) +#define COMPUTE_m16n4 COMPUTE_SIMPLE(16,4) +#define COMPUTE_m16n6 COMPUTE_SIMPLE(16,6) +#define COMPUTE_m16n8 COMPUTE_SIMPLE(16,8) +#define COMPUTE_m16n10 COMPUTE_SIMPLE(16,10) +#if defined(TRMMKERNEL) && !defined(LEFT) && defined(TRANSA) + #define INVERSE_K_MID "negq %5; leaq 6(%%r13,%5,1),%5;" +#else + #define INVERSE_K_MID "negq %5; leaq 16(%%r13,%5,1),%5;" +#endif +#define COMPUTE_m16n12 \ + init_update_k(16) INIT_m16n12 "movq %%r13,%5;" INIT_set_papb(16,12) "movq %2,%3;"\ + kernel_kstart_n12(16,subq)\ + "cmpq $16,%5; jb 7167123f; movq $16,%5;"\ + "7167121:\n\t"\ + KERNEL_k1m16n12 "addq $4,%5; testq $12,%5; movq $172,%%r10; cmovz %4,%%r10;"\ + KERNEL_k1m16n12 "prefetcht1 (%3); subq $129,%3; addq %%r10,%3;"\ + KERNEL_k1m16n12 "prefetcht1 (%6); addq $32,%6; cmpq $208,%5; cmoveq %2,%3;"\ + KERNEL_k1m16n12 "cmpq %5,%%r13; jnb 7167121b;"\ + "movq %2,%3;" INVERSE_K_MID\ + "7167123:\n\t"\ + "testq %5,%5; jz 7167129f;"\ + "7167125:\n\t"\ + "prefetcht0 (%3); prefetcht0 64(%3); prefetcht0 127(%3);"\ + KERNEL_k1m16n12 "addq %4,%3; decq %5;jnz 7167125b;"\ + "7167129:\n\t"\ + kernel_kend_n12(16)\ + "prefetcht0 (%%r14);" SAVE_set_pa(16) SAVE_m16n12 save_update_k(16) +#define COMPUTE(ndim) {\ + b_pref = b_ptr + ndim * K; HEAD_SET_OFF(ndim)\ + __asm__ __volatile__(\ + "vbroadcastsd %8,%%zmm0; movq %7,%%r11; movq %1,%%r14; movq %10,%%r12; salq $4,%%r12;" INITASM_SET_K\ + "cmpq $16,%%r11; jb "#ndim"33102f;"\ + #ndim"33101:\n\t"\ + COMPUTE_m16n##ndim "subq $16,%%r11; cmpq $16,%%r11; jnb "#ndim"33101b;"\ + #ndim"33102:\n\t"\ + "cmpq $8,%%r11; jb "#ndim"33103f;"\ + COMPUTE_SIMPLE(8,ndim) "subq $8,%%r11;"\ + #ndim"33103:\n\t"\ + "cmpq $4,%%r11; jb "#ndim"33104f;"\ + COMPUTE_SIMPLE(4,ndim) "subq $4,%%r11;"\ + #ndim"33104:\n\t"\ + "cmpq $2,%%r11; jb "#ndim"33105f;"\ + COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\ + #ndim"33105:\n\t"\ + "testq %%r11,%%r11; jz "#ndim"33106f;"\ + COMPUTE_SIMPLE(1,ndim) "subq $1,%%r11;"\ + #ndim"33106:\n\t"\ + "movq %%r14,%1;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K):"r10","r11","r12","r13","r14","r15","cc","memory",\ + "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\ + "zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\ + a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ndim * ldc - M; TAIL_SET_OFF(ndim)\ +} + +#include "common.h" +#include + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif +) +{ + if(m==0||n==0||k==0||alpha==0.0) return 0; + int64_t ldc_in_bytes = (int64_t)ldc * sizeof(double); double ALPHA = alpha; + int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; + BLASLONG n_count = n, off = 0; + double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*b_pref = B; +#ifdef TRMMKERNEL + #ifdef LEFT + off = offset; + #else + off = -offset; + #endif +#endif + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>9;n_count-=10) COMPUTE(10) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>5;n_count-=6) COMPUTE(6) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} + From f3f969f681ffc67f81ac78d37e51f83129232985 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 3 Feb 2020 21:34:12 +0800 Subject: [PATCH 02/12] Update param.h --- param.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/param.h b/param.h index 075c12ca2..219d99fc6 100644 --- a/param.h +++ b/param.h @@ -1660,14 +1660,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 16 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 @@ -1701,12 +1701,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_P 640 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 192 #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 320 -#define DGEMM_DEFAULT_Q 168 +#define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 From 081b1885294afedf7f5ee87e1d9bd8b82d096664 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 3 Feb 2020 21:38:08 +0800 Subject: [PATCH 03/12] Update KERNEL.SKYLAKEX --- kernel/x86_64/KERNEL.SKYLAKEX | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 0e6275748..dcd201649 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -7,10 +7,13 @@ SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMKERNEL = dgemm_kernel_4x8_skylakex_2.c - -DGEMMONCOPY = dgemm_ncopy_8_skylakex.c -DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c +DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c +DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c From 83b6be7976dd02973851c6df68e579a024aebfcc Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 4 Feb 2020 19:55:26 +0800 Subject: [PATCH 04/12] Update param.h --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 219d99fc6..e6ab93aa5 100644 --- a/param.h +++ b/param.h @@ -1711,7 +1711,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r -#define DGEMM_DEFAULT_R 13824 +#define DGEMM_DEFAULT_R 8640 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r From 1c3e20ce483c5b6a9bb457bd199c16a0b0bbd8de Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 4 Feb 2020 20:30:23 +0800 Subject: [PATCH 05/12] Update level3.c --- driver/level3/level3.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 1ab7a740e..9aa67286f 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -332,13 +332,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif START_RPCC(); From 77b8f49556096952fb4495b7019b58597f81dce8 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 4 Feb 2020 20:33:08 +0800 Subject: [PATCH 06/12] Update level3_thread.c --- driver/level3/level3_thread.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index cfbff7554..bf558447e 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -365,12 +365,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Split local region of B into parts */ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ min_jj = MIN(n_to, js + div_n) - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif /* Copy part of local region of B into workspace */ START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, From 833bd0f8ffd1b7921ca8e98196e40183158f8cb7 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 5 Feb 2020 10:09:41 +0800 Subject: [PATCH 07/12] Update trmm_L.c --- driver/level3/trmm_L.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 8a81d31a0..9117090b5 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -135,10 +135,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); @@ -201,10 +205,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); @@ -292,10 +300,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, @@ -358,10 +370,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, From 2f96a2c55b2cfae827973b3002ae5af8bfa6d7d6 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 5 Feb 2020 10:15:02 +0800 Subject: [PATCH 08/12] Update trmm_R.c --- driver/level3/trmm_R.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index 0882aa496..62c6a2442 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -122,10 +122,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); #else @@ -142,10 +146,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); #else @@ -195,10 +203,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #else @@ -246,10 +258,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); #else @@ -267,10 +283,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, sb + min_l * (min_l + jjs) * COMPSIZE); @@ -324,10 +344,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #else From 096da2f51acc37d67e84646336753a77b4b007ad Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 5 Feb 2020 13:36:57 +0800 Subject: [PATCH 09/12] Update dgemm_kernel_16x2_skylakex.c --- kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 250ff8d49..743ad5aa7 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -1,3 +1,6 @@ +/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = b_pref */ +/* r10 = tmp, r11 = m_counter, r12 = size_of_1_tile_in_b, r13 = k, r14 = b_head, r15 = %1+3*r12 */ + #if (defined (LEFT) && !defined(TRANSA)) || (!defined (LEFT) && defined(TRANSA)) #define BACKWARDS 1 #else From 4e00d96a78b8b2d7b6cda9dafc9a72b6777d8828 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 6 Feb 2020 01:46:36 +0000 Subject: [PATCH 10/12] Update dgemm_kernel_16x2_skylakex.c --- kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 743ad5aa7..4c4c2f4e4 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -467,7 +467,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, #endif ) { - if(m==0||n==0||k==0||alpha==0.0) return 0; + if(m==0||n==0) return 0; int64_t ldc_in_bytes = (int64_t)ldc * sizeof(double); double ALPHA = alpha; int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; BLASLONG n_count = n, off = 0; From 8b5cdcc64c0b5ed4fc58e9bd6a314759eb6c1294 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 6 Feb 2020 01:47:46 +0000 Subject: [PATCH 11/12] Update sgemm_kernel_8x4_haswell.c --- kernel/x86_64/sgemm_kernel_8x4_haswell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell.c b/kernel/x86_64/sgemm_kernel_8x4_haswell.c index 2b8aa9862..2250e4b97 100644 --- a/kernel/x86_64/sgemm_kernel_8x4_haswell.c +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell.c @@ -467,7 +467,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f ,BLASLONG offset #endif ){ - if(m==0||n==0||k==0||alpha==0.0) return 0; + if(m==0||n==0) return 0; int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float); float constval = alpha; float *const_val=&constval; From 3447d04eaf3c98b4fcd1ba41fa13774ff15c72e3 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 6 Feb 2020 02:14:10 +0000 Subject: [PATCH 12/12] Update dgemm_kernel_16x2_skylakex.c --- kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 4c4c2f4e4..416ace59b 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -393,9 +393,10 @@ #define INIT_m1n12 INIT_m1n10 "vpxor %%xmm9,%%xmm9,%%xmm9;" #define COMPUTE_SIMPLE(mdim,ndim)\ - init_update_k(mdim) INIT_m##mdim##n##ndim "testq %%r13,%%r13; jz 7"#mdim"7"#ndim"9f;"\ + init_update_k(mdim) INIT_m##mdim##n##ndim\ "movq %%r13,%5;" INIT_set_papb(mdim,ndim)\ kernel_kstart_n##ndim(mdim,subq)\ + "testq %5,%5; jz 7"#mdim"7"#ndim"9f;"\ "7"#mdim"7"#ndim"1:\n\t"\ KERNEL_k1m##mdim##n##ndim "decq %5; jnz 7"#mdim"7"#ndim"1b;"\ "7"#mdim"7"#ndim"9:\n\t"\