From 844629af5702148b5eaee909472f4a80b368498d Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 16 Oct 2019 02:00:34 +0800 Subject: [PATCH 1/9] Add files via upload --- kernel/x86_64/KERNEL.SKYLAKEX | 4 +- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 811 ++++++++++++++++++++++ 2 files changed, 812 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/dgemm_kernel_8x8_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index d61c51628..d73a47925 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -7,10 +7,8 @@ SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -#DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c +DGEMMKERNEL = dgemm_kernel_8x8_skylakex.c -#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c -#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c new file mode 100644 index 000000000..b4a87cbce --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -0,0 +1,811 @@ +#include "common.h" +#include +/* row-major c_block */ +/* 64-bit pointer registers: a_block_pointer,b_block_pointer,c_pointer;*/ +#define INNER_KERNEL_k1m1n8 \ + "prefetcht0 384(%1);"\ + "prefetcht0 768(%0); vmovupd (%1),%%zmm5; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" + +#define INNER_KERNEL_k1m2n8 \ + INNER_KERNEL_k1m1n8\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m4n8 \ + INNER_KERNEL_k1m2n8\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m8n8 \ + INNER_KERNEL_k1m4n8\ + "vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\ + "vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\ + "vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\ + "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;" + +#define INNER_KERNEL_k1m1n16 \ + "prefetcht0 384(%1); prefetcht0 448(%1);"\ + "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; addq $128,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m2n16 \ + INNER_KERNEL_k1m1n16\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m4n16 \ + INNER_KERNEL_k1m2n16\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;" + +#define INNER_KERNEL_k1m8n16 \ + INNER_KERNEL_k1m4n16\ + "vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\ + "vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\ + "vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\ + "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;" + +#define INNER_KERNEL_k1m1n24 \ + "prefetcht0 384(%1); prefetcht0 448(%1); prefetcht0 512(%1);"\ + "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; vmovupd 128(%1),%%zmm7; addq $192,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd 
%%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" + +#define INNER_KERNEL_k1m2n24 \ + INNER_KERNEL_k1m1n24\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" + +#define INNER_KERNEL_k1m4n24 \ + INNER_KERNEL_k1m2n24\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;" + +#define INNER_KERNEL_k1m8n24 \ + INNER_KERNEL_k1m4n24\ + "vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\ + "vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\ + "vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\ + "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;" + +#define INNER_KERNELm1(nn) \ + "cmpq $1,%2;jb "#nn"3f;"\ + #nn"4:\n\t"\ + INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ + #nn"3:\n\t" + +#define INNER_KERNELm2(nn) \ + "cmpq $1,%2;jb "#nn"0f;"\ + #nn"1:\n\t"\ + INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ + #nn"0:\n\t" + +#define INNER_KERNELm4(nn) \ + "cmpq $1,%2;jb "#nn"00f;"\ + #nn"01:\n\t"\ + INNER_KERNEL_k1m4n##nn "addq $32,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ + #nn"00:\n\t" + +#define INNER_KERNELm8(nn) \ + "cmpq $8,%2;jb "#nn"001f;"\ + #nn"008:\n\t"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + "subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\ + #nn"001:\n\t"\ + "cmpq $1,%2;jb "#nn"000f;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"001b;"\ + ""#nn"000:\n\t" + +#define INNER_INIT_m1n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8;" + +#define INNER_INIT_m2n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" + +#define INNER_INIT_m4n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" + +#define INNER_INIT_m8n8 \ + INNER_INIT_m4n8\ + "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" + +#define INNER_INIT_m1n16 INNER_INIT_m2n8 + +#define INNER_INIT_m2n16 INNER_INIT_m4n8 + +#define INNER_INIT_m4n16 INNER_INIT_m8n8 + +#define INNER_INIT_m8n16 \ + INNER_INIT_m8n8\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ + "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" + +#define INNER_INIT_m1n24 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;" + +#define INNER_INIT_m2n24 \ + INNER_INIT_m1n24\ + "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" + +#define INNER_INIT_m4n24 \ + 
INNER_INIT_m4n16\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" + +#define INNER_INIT_m8n24 \ + INNER_INIT_m8n16\ + "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ + "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" + +#define INNER_SETINDEX \ + "vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ + "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" + +#define INNER_STORE_m1n8(c1,disp) \ + "kxnorw %%k1,%%k1,%%k1;"\ + "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\ + "vaddpd %%zmm7,"#c1","#c1";"\ + "kxnorw %%k1,%%k1,%%k1;"\ + "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};" + +#define INNER_SAVE_m1n8 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0) + +#define INNER_SAVE_m1n16 \ + INNER_SAVE_m1n8\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0) + +#define INNER_SAVE_m1n24 \ + INNER_SAVE_m1n16\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm10,0) + +#define INNER_SAVE_m2n8 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm9,8) + +#define INNER_SAVE_m2n16 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm10,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm11,8) + +#define INNER_SAVE_m2n24 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm11,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm12,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm10,0)\ + INNER_STORE_m1n8(%%zmm13,8) + +#define INNER_PREF_8x8 \ + "prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\ + "prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\ + "prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\ + "prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\ + "subq %4,%3; subq %4,%3; subq %4,%3;" + +#define INNER_TRANS_4x8(c1,c2,c3,c4) \ + "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\ + "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\ + "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\ + "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\ + "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\ + "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";" + +#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + INNER_TRANS_4x8(c1,c2,c3,c4)\ + INNER_TRANS_4x8(c5,c6,c7,c8)\ + "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\ + "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\ + "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\ + "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd 
%%zmm5,"#c6","#c6"%{%5%};"\ + "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\ + "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\ + "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\ + "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};" + +#define INNER_STORE_4x8(c1,c2,c3,c4) \ + "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vaddpd %%zmm4,"#c1","#c1";"\ + "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vaddpd %%zmm5,"#c2","#c2";"\ + "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vaddpd %%zmm6,"#c3","#c3";"\ + "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vaddpd %%zmm7,"#c4","#c4";"\ + "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "leaq (%3,%4,4),%3;" + +#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vaddpd (%3),"#c1","#c1"; vmovupd "#c1",(%3); vaddpd (%3,%4,1),"#c2","#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vaddpd (%3),"#c3","#c3"; vmovupd "#c3",(%3); vaddpd (%3,%4,1),"#c4","#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vaddpd (%3),"#c5","#c5"; vmovupd "#c5",(%3); vaddpd (%3,%4,1),"#c6","#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vaddpd (%3),"#c7","#c7"; vmovupd "#c7",(%3); vaddpd (%3,%4,1),"#c8","#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define INNER_SAVE_m4n8 \ + INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ + INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) + +#define INNER_SAVE_m4n16 \ + INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ + INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ + INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\ + INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15) + +#define INNER_SAVE_m4n24 \ + INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ + INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ + INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ + INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ + INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\ + INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19) + +#define INNER_SAVE_m8n8 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) + +#define INNER_SAVE_m8n16 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ + INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\ + INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23) + +#define INNER_SAVE_m8n24 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ + INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ + 
INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\ + INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31) + +#define COMPUTE_m1n8 {\ + __asm__ __volatile__(\ + INNER_INIT_m1n8\ + INNER_KERNELm1(8)\ + INNER_SAVE_m1n8\ + :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\ + :"zmm4","zmm5","zmm6","zmm7","zmm8","cc","memory","k1");\ + c_pointer += 1;\ +} +#define COMPUTE_m2n8 {\ + __asm__ __volatile__(\ + INNER_INIT_m2n8\ + INNER_KERNELm2(8)\ + INNER_SAVE_m2n8\ + :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\ + :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","cc","memory","k1");\ + c_pointer += 2;\ +} +#define COMPUTE_m4n8 {\ + __asm__ __volatile__(\ + INNER_INIT_m4n8\ + INNER_KERNELm4(8)\ + INNER_SAVE_m4n8\ + :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03),"Yk"(k01)\ + :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","cc","memory");\ + c_pointer += 4;\ +} +#define COMPUTE_m8n8 {\ + __asm__ __volatile__(\ + INNER_INIT_m8n8\ + INNER_KERNELm8(8)\ + INNER_SAVE_m8n8\ + :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03)\ + :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory");\ + c_pointer += 8;\ +} + +#define COMPUTE_n8 {\ + __asm__ __volatile__(\ + "movq %8,%%r14;movq %2,%%r13;"\ + "cmpq $8,%8; jb 42222f;"\ + "42221:\n\t"\ + INNER_INIT_m8n8\ + INNER_KERNELm8(8)\ + INNER_SAVE_m8n8\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ + "42222:\n\t"\ + "cmpq $4,%8; jb 42223f;"\ + INNER_INIT_m4n8\ + INNER_KERNELm4(8)\ + INNER_SAVE_m4n8\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "42223:\n\t"\ + "cmpq $2,%8; jb 42224f;"\ + INNER_INIT_m2n8\ + INNER_KERNELm2(8)\ + INNER_SAVE_m2n8\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "42224:\n\t"\ + "cmpq $1,%8; jb 42225f;"\ + INNER_INIT_m1n8\ + INNER_KERNELm1(8)\ + INNER_SAVE_m1n8\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "addq $8,%3;"\ + "42225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ + ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n16 {\ + __asm__ __volatile__(\ + "movq %8,%%r14;movq %2,%%r13;"\ + "cmpq $8,%8; jb 32222f;"\ + "32221:\n\t"\ + INNER_INIT_m8n16\ + INNER_KERNELm8(16)\ + INNER_SAVE_m8n16\ + "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ + "32222:\n\t"\ + "cmpq $4,%8; jb 32223f;"\ + INNER_INIT_m4n16\ + INNER_KERNELm4(16)\ + INNER_SAVE_m4n16\ + "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "32223:\n\t"\ + "cmpq $2,%8; jb 32224f;"\ + INNER_INIT_m2n16\ + INNER_KERNELm2(16)\ + INNER_SAVE_m2n16\ + "movq 
%%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\ + "subq $2,%8;"\ + "32224:\n\t"\ + "cmpq $1,%8; jb 32225f;"\ + INNER_INIT_m1n16\ + INNER_KERNELm1(16)\ + INNER_SAVE_m1n16\ + "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\ + "32225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ + :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ + ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n24 {\ + __asm__ __volatile__(\ + "movq %8,%%r14;movq %9,%%r15;movq %2,%%r13;"\ + "cmpq $8,%8; jb 22222f;"\ + "22221:\n\t"\ + INNER_INIT_m8n24\ + "prefetcht2 (%%r15); prefetcht2 64(%%r15);"\ + INNER_KERNELm8(24)\ + "prefetcht2 128(%%r15); prefetcht2 192(%%r15);"\ + INNER_SAVE_m8n24\ + "prefetcht2 256(%%r15); prefetcht2 320(%%r15); addq $384,%%r15;"\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ + "22222:\n\t"\ + "cmpq $4,%8; jb 22223f;"\ + INNER_INIT_m4n24\ + INNER_KERNELm4(24)\ + INNER_SAVE_m4n24\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "22223:\n\t"\ + "cmpq $2,%8; jb 22224f;"\ + INNER_INIT_m2n24\ + INNER_KERNELm2(24)\ + INNER_SAVE_m2n24\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\ + "subq $2,%8;"\ + "22224:\n\t"\ + "cmpq $1,%8; jb 22225f;"\ + INNER_INIT_m1n24\ + INNER_KERNELm1(24)\ + INNER_SAVE_m1n24\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\ + "22225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ + :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),\ + "+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(packed_b_pointer)\ + ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ + "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r13","r14","r15");\ + a_block_pointer -= M * K;\ +} + +static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8 +//perform C += A B + if(k==0 || m==0 || ndiv8==0) return; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); + int64_t K = (int64_t)k; int64_t M = (int64_t)m; + double *a_block_pointer; + double *c_pointer = c; + __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; + BLASLONG ndiv8_count; + double *b_scratch = (double *)aligned_alloc(64,192*k); + double *packed_b_pointer = packed_b; + a_block_pointer = packed_a; + for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ + __asm__ __volatile__ ( + "testq %2,%2; jz 100002f;movq %2,%%r13;shlq $6,%%r13;" + "100001:\n\t" + "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; vmovupd (%0,%%r13,2),%%zmm7; addq $64,%0;" + "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); vmovupd 
%%zmm7,128(%1); addq $192,%1;" + "decq %2; testq %2,%2; jnz 100001b;" + "100002:\n\t" + "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,2),%0;subq %%r13,%1;subq %%r13,%1;subq %%r13,%1;" + :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6","zmm7"); + COMPUTE_n24 + } + for(;ndiv8_count>1;ndiv8_count-=2){ + __asm__ __volatile__ ( + "testq %2,%2; jz 1000002f;movq %2,%%r13;shlq $6,%%r13;" + "1000001:\n\t" + "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; addq $64,%0;" + "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); addq $128,%1;" + "decq %2; testq %2,%2; jnz 1000001b;" + "1000002:\n\t" + "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,1),%0;subq %%r13,%1;subq %%r13,%1;" + :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6"); + COMPUTE_n16 + } + if(ndiv8_count>0){ + COMPUTE_n8 + } + free(b_scratch);b_scratch=NULL; +} + +/* __m512d accumulators: zc1-zc4; temporary variables: za1,zb1-zb2 */ +/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ +/* __m128d accumulators: xc1-xc4; temporary variables: xa1,xb1-xb2 */ +/* double accumulator: sc1; temporary variables: sa1,sb1 */ +/* column-major c_block */ +#define KERNEL_m8n4k1 {\ + __asm__ __volatile__(\ + "vmovupd (%0),%2; addq $64,%0;"\ + "vbroadcastsd (%1),%3; vfmadd231pd %2,%3,%5; "\ + "vbroadcastsd 8(%1),%4; vfmadd231pd %2,%4,%6; "\ + "vbroadcastsd 16(%1),%3; vfmadd231pd %2,%3,%7; "\ + "vbroadcastsd 24(%1),%4; vfmadd231pd %2,%4,%8; "\ + "addq $32,%1;"\ + :"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zb2),"+v"(zc1),"+v"(zc2),"+v"(zc3),"+v"(zc4)::"cc","memory");\ +} +#define KERNEL_m8n2k1 {\ + __asm__ __volatile__(\ + "vmovupd (%0),%2; addq $64,%0;"\ + "vbroadcastsd (%1),%3; vfmadd231pd %2,%3,%5; "\ + "vbroadcastsd 8(%1),%4; vfmadd231pd %2,%4,%6; "\ + "addq $16,%1;"\ + :"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zb2),"+v"(zc1),"+v"(zc2)::"cc","memory");\ +} +#define KERNEL_m8n1k1 {\ + __asm__ __volatile__(\ + "vmovupd (%0),%2; addq $64,%0;"\ + "vbroadcastsd (%1),%3; vfmadd231pd %2,%3,%4; "\ + "addq $8,%1;"\ + :"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zc1)::"cc","memory");\ +} +#define INIT_m8n1 zc1=_mm512_setzero_pd(); +#define INIT_m8n2 zc2=INIT_m8n1 +#define INIT_m8n4 zc4=zc3=INIT_m8n2 +#define SAVE_m8n1 {\ + za1 = _mm512_loadu_pd(c_pointer);\ + zc1 = _mm512_add_pd(zc1,za1);\ + _mm512_storeu_pd(c_pointer,zc1);\ + c_pointer += 8;\ +} +#define SAVE_m8n2 {\ + zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ + zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\ + _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\ + c_pointer += 8;\ +} +#define SAVE_m8n4 {\ + zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ + zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\ + _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\ + c_pointer += LDC*2;\ + zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ + zc3 = _mm512_add_pd(zc3,zb1); zc4 = _mm512_add_pd(zc4,zb2);\ + _mm512_storeu_pd(c_pointer,zc3); _mm512_storeu_pd(c_pointer+LDC,zc4);\ + c_pointer += 8-LDC*2;\ +} +#define KERNEL_m4n4k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ 
+ yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ + b_block_pointer+=4;\ +} +#define KERNEL_m4n2k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + b_block_pointer+=2;\ +} +#define KERNEL_m4n1k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + b_block_pointer++;\ +} +#define INIT_m4n1 yc1=_mm256_setzero_pd(); +#define INIT_m4n2 yc2=INIT_m4n1 +#define INIT_m4n4 yc4=yc3=INIT_m4n2 +#define SAVE_m4n1 {\ + ya1 = _mm256_loadu_pd(c_pointer);\ + yc1 = _mm256_add_pd(yc1,ya1);\ + _mm256_storeu_pd(c_pointer,yc1);\ + c_pointer += 4;\ +} +#define SAVE_m4n2 {\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += 4;\ +} +#define SAVE_m4n4 {\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += LDC*2;\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc3 = _mm256_add_pd(yc3,yb1); yc4 = _mm256_add_pd(yc4,yb2);\ + _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ + c_pointer += 4-LDC*2;\ +} +#define KERNEL_m2n2k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ + b_block_pointer += 2;\ +} +#define KERNEL_m2n1k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + b_block_pointer ++;\ +} +#define INIT_m2n1 xc1=_mm_setzero_pd(); +#define INIT_m2n2 xc2=INIT_m2n1 +#define SAVE_m2n1 {\ + xa1 = _mm_loadu_pd(c_pointer);\ + xc1 = _mm_add_pd(xc1,xa1);\ + _mm_storeu_pd(c_pointer,xc1);\ + c_pointer += 2;\ +} +#define SAVE_m2n2 {\ + xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ + xc1 = _mm_add_pd(xc1,xb1); xc2 = _mm_add_pd(xc2,xb2);\ + _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ + c_pointer += 2;\ +} +#define KERNEL_m1n1k1 {\ + sa1 = *a_block_pointer; a_block_pointer++;\ + sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ + b_block_pointer ++;\ +} +#define INIT_m1n1 sc1=0.0; +#define SAVE_m1n1 {\ + *c_pointer += sc1;\ + c_pointer++;\ +} + +/* row-major c_block */ +#define KERNEL_m2n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ + a_block_pointer += 2;\ +} +#define KERNEL_m1n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + a_block_pointer ++;\ +} +#define KERNEL_m1n2k1 {\ + xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ + xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + a_block_pointer ++;\ +} +#define INIT_m1n2 INIT_m2n1 +#define INIT_m1n4 INIT_m4n1 +#define INIT_m2n4 INIT_m4n2 +#define SAVE_m2n4 {\ + yb1 = 
_mm256_unpacklo_pd(yc1,yc2);\
+  yb2 = _mm256_unpackhi_pd(yc1,yc2);\
+  xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\
+  xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\
+  _mm_storeu_pd(c_pointer,xb1);\
+  _mm_storeu_pd(c_pointer+LDC,xb2);\
+  xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\
+  xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\
+  _mm_storeu_pd(c_pointer+2*LDC,xb1);\
+  _mm_storeu_pd(c_pointer+3*LDC,xb2);\
+  c_pointer += 2;\
+}
+#define SAVE_m1n2 {\
+  *c_pointer += _mm_cvtsd_f64(xc1);\
+  xa1 = _mm_unpackhi_pd(xc1,xc1);\
+  c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\
+  c_pointer ++;\
+}
+#define SAVE_m1n4 {\
+  *c_pointer += _mm256_cvtsd_f64(yc1);\
+  ya1 = _mm256_unpackhi_pd(yc1,yc1);\
+  c_pointer[LDC] += _mm256_cvtsd_f64(ya1);\
+  xb1 = _mm256_extractf128_pd(yc1,1);\
+  c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\
+  xb2 = _mm_unpackhi_pd(xb1,xb1);\
+  c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\
+  c_pointer ++;\
+}
+
+static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
+//perform C += A B , edge_n<8 must be satisfied !
+  if(k==0 || m==0 || edge_n==0) return;
+  double *a_block_pointer,*b_block_pointer,*b_base_pointer;
+  double *c_pointer = c;
+  __m512d zb1,zb2,za1,zc1,zc2,zc3,zc4;
+  __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2;
+  __m128d xc1,xc2,xa1,xb1,xb2;
+  double sc1,sa1,sb1;
+  BLASLONG m_count,n_count,k_count;
+  b_base_pointer = packed_b;
+//now start calculation of the edge part
+  for(n_count=edge_n;n_count>3;n_count-=4){
+    a_block_pointer = packed_a;
+    for(m_count=m;m_count>7;m_count-=8){
+      b_block_pointer = b_base_pointer;
+      INIT_m8n4
+      for(k_count=0;k_count<k;k_count++) KERNEL_m8n4k1
+      SAVE_m8n4
+    }
+    for(;m_count>3;m_count-=4){
+      b_block_pointer = b_base_pointer;
+      INIT_m4n4
+      for(k_count=0;k_count<k;k_count++) KERNEL_m4n4k1
+      SAVE_m4n4
+    }
+    for(;m_count>1;m_count-=2){
+      b_block_pointer = b_base_pointer;
+      INIT_m2n4
+      for(k_count=0;k_count<k;k_count++) KERNEL_m2n4k1
+      SAVE_m2n4
+    }
+    if(m_count>0){
+      b_block_pointer = b_base_pointer;
+      INIT_m1n4
+      for(k_count=0;k_count<k;k_count++) KERNEL_m1n4k1
+      SAVE_m1n4
+    }
+    b_base_pointer += 4*k;
+    c_pointer += 4*LDC-m;
+  }
+  for(;n_count>1;n_count-=2){
+    a_block_pointer = packed_a;
+    for(m_count=m;m_count>7;m_count-=8){
+      b_block_pointer = b_base_pointer;
+      INIT_m8n2
+      for(k_count=0;k_count<k;k_count++) KERNEL_m8n2k1
+      SAVE_m8n2
+    }
+    for(;m_count>3;m_count-=4){
+      b_block_pointer = b_base_pointer;
+      INIT_m4n2
+      for(k_count=0;k_count<k;k_count++) KERNEL_m4n2k1
+      SAVE_m4n2
+    }
+    for(;m_count>1;m_count-=2){
+      b_block_pointer = b_base_pointer;
+      INIT_m2n2
+      for(k_count=0;k_count<k;k_count++) KERNEL_m2n2k1
+      SAVE_m2n2
+    }
+    if(m_count>0){
+      b_block_pointer = b_base_pointer;
+      INIT_m1n2
+      for(k_count=0;k_count<k;k_count++) KERNEL_m1n2k1
+      SAVE_m1n2
+    }
+    b_base_pointer += 2*k;
+    c_pointer += 2*LDC-m;
+  }
+  if(n_count>0){
+    a_block_pointer = packed_a;
+    for(m_count=m;m_count>7;m_count-=8){
+      b_block_pointer = b_base_pointer;
+      INIT_m8n1
+      for(k_count=0;k_count<k;k_count++) KERNEL_m8n1k1
+      SAVE_m8n1
+    }
+    for(;m_count>3;m_count-=4){
+      b_block_pointer = b_base_pointer;
+      INIT_m4n1
+      for(k_count=0;k_count<k;k_count++) KERNEL_m4n1k1
+      SAVE_m4n1
+    }
+    for(;m_count>1;m_count-=2){
+      b_block_pointer = b_base_pointer;
+      INIT_m2n1
+      for(k_count=0;k_count<k;k_count++) KERNEL_m2n1k1
+      SAVE_m2n1
+    }
+    if(m_count>0){
+      b_block_pointer = b_base_pointer;
+      INIT_m1n1
+      for(k_count=0;k_count<k;k_count++) KERNEL_m1n1k1
+      SAVE_m1n1
+    }
+  }
+}
+static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k,double alpha){
+  BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp,alp;
+  src1 = src; dst1 = dst; src2 = src1 + 4 * k; alp = _mm256_set1_pd(alpha);
+  for(m_count=m;m_count>7;m_count-=8){
+    for(k_count=k;k_count>0;k_count--){
+      tmp = _mm256_loadu_pd(src1);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+0,tmp);src1+=4;
+      tmp = _mm256_loadu_pd(src2);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+4,tmp);src2+=4;
+      dst1+=8;
+    }
+    src1+=4*k;src2+=4*k;
+  }
+  for(;m_count>0;m_count--){
+    for(k_count=k;k_count>0;k_count--){
+      *dst1 = (*src1) * alpha; src1++; dst1++;
+    }
+  }
+}
+int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
+  if(m==0 || n==0 || k==0) return 0;
+  BLASLONG ndiv8 = n/8;
+  double *packed_a = (double *)malloc(m*k*sizeof(double));
+ 
copy_4_to_8(A,packed_a,m,k,alpha); + if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C); + if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8); + free(packed_a);packed_a=NULL; + return 0; +} + From 5da9484d932cd220934207f83ff111df248bba7f Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 16 Oct 2019 02:01:13 +0800 Subject: [PATCH 2/9] Add files via upload --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 0ff59f400..860106991 100644 --- a/param.h +++ b/param.h @@ -1700,7 +1700,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_Q 128 #else #define SGEMM_DEFAULT_Q 384 -#define DGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 128 #endif #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 From 6bd67ddbab5ef752e1cafca4e4b7b66ecbb57452 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 16 Oct 2019 03:20:08 +0800 Subject: [PATCH 3/9] Update dgemm_kernel_8x8_skylakex.c --- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 38 +---------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c index b4a87cbce..69437e665 100644 --- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -1,4 +1,5 @@ #include "common.h" +#include #include /* row-major c_block */ /* 64-bit pointer registers: a_block_pointer,b_block_pointer,c_pointer;*/ @@ -289,43 +290,6 @@ INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\ INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31) -#define COMPUTE_m1n8 {\ - __asm__ __volatile__(\ - INNER_INIT_m1n8\ - INNER_KERNELm1(8)\ - INNER_SAVE_m1n8\ - :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\ - :"zmm4","zmm5","zmm6","zmm7","zmm8","cc","memory","k1");\ - c_pointer += 1;\ -} -#define COMPUTE_m2n8 {\ - __asm__ __volatile__(\ - INNER_INIT_m2n8\ - INNER_KERNELm2(8)\ - INNER_SAVE_m2n8\ - :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\ - :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","cc","memory","k1");\ - c_pointer += 2;\ -} -#define COMPUTE_m4n8 {\ - __asm__ __volatile__(\ - INNER_INIT_m4n8\ - INNER_KERNELm4(8)\ - INNER_SAVE_m4n8\ - :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03),"Yk"(k01)\ - :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","cc","memory");\ - c_pointer += 4;\ -} -#define COMPUTE_m8n8 {\ - __asm__ __volatile__(\ - INNER_INIT_m8n8\ - INNER_KERNELm8(8)\ - INNER_SAVE_m8n8\ - :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03)\ - :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory");\ - c_pointer += 8;\ -} - #define COMPUTE_n8 {\ __asm__ __volatile__(\ "movq %8,%%r14;movq %2,%%r13;"\ From 9b19e9e1b01c6820aaff0c7683ac6b317c9dfaba Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 16 Oct 2019 10:14:51 +0800 Subject: [PATCH 4/9] Update dgemm_kernel_8x8_skylakex.c --- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c 
b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c index 69437e665..1db955776 100644 --- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -429,7 +429,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG double *c_pointer = c; __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; BLASLONG ndiv8_count; - double *b_scratch = (double *)aligned_alloc(64,192*k); + double *b_scratch; + posix_memalign(&b_scratch,64,192*k); double *packed_b_pointer = packed_b; a_block_pointer = packed_a; for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ @@ -637,9 +638,10 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG c_pointer ++;\ } #define SAVE_m1n4 {\ - *c_pointer += _mm256_cvtsd_f64(yc1);\ - ya1 = _mm256_unpackhi_pd(yc1,yc1);\ - c_pointer[LDC] += _mm256_cvtsd_f64(ya1);\ + xb1 = _mm256_extractf128_pd(yc1,0);\ + *c_pointer += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ xb1 = _mm256_extractf128_pd(yc1,1);\ c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ xb2 = _mm_unpackhi_pd(xb1,xb1);\ From b7315f8401089a91ae382b87be7e2683745828da Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 16 Oct 2019 19:23:36 +0800 Subject: [PATCH 5/9] Add files via upload --- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 75 ++++++++--------------- 1 file changed, 26 insertions(+), 49 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c index 1db955776..b8b3234d1 100644 --- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -25,8 +25,8 @@ "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;" #define INNER_KERNEL_k1m1n16 \ - "prefetcht0 384(%1); prefetcht0 448(%1);"\ - "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; addq $128,%1;"\ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1);"\ + "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; addq $64,%1;"\ "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" #define INNER_KERNEL_k1m2n16 \ @@ -46,8 +46,8 @@ "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;" #define INNER_KERNEL_k1m1n24 \ - "prefetcht0 384(%1); prefetcht0 448(%1); prefetcht0 512(%1);"\ - "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; vmovupd 128(%1),%%zmm7; addq $192,%1;"\ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1); prefetcht0 128(%1,%%r12,2);"\ + "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; vmovupd (%1,%%r12,2),%%zmm7; addq $64,%1;"\ "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" #define INNER_KERNEL_k1m2n24 \ @@ -292,13 +292,13 @@ #define COMPUTE_n8 {\ __asm__ __volatile__(\ - "movq %8,%%r14;movq %2,%%r13;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 42222f;"\ "42221:\n\t"\ INNER_INIT_m8n8\ INNER_KERNELm8(8)\ INNER_SAVE_m8n8\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\ "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ "42222:\n\t"\ @@ -306,7 +306,7 @@ INNER_INIT_m4n8\ INNER_KERNELm4(8)\ INNER_SAVE_m4n8\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\ "subq $4,%8;"\ 
"42223:\n\t"\ @@ -314,7 +314,7 @@ INNER_INIT_m2n8\ INNER_KERNELm2(8)\ INNER_SAVE_m2n8\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "addq $16,%3;"\ "subq $2,%8;"\ "42224:\n\t"\ @@ -322,7 +322,7 @@ INNER_INIT_m1n8\ INNER_KERNELm1(8)\ INNER_SAVE_m1n8\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "addq $8,%3;"\ "42225:\n\t"\ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ @@ -333,13 +333,13 @@ } #define COMPUTE_n16 {\ __asm__ __volatile__(\ - "movq %8,%%r14;movq %2,%%r13;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 32222f;"\ "32221:\n\t"\ INNER_INIT_m8n16\ INNER_KERNELm8(16)\ INNER_SAVE_m8n16\ - "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ "32222:\n\t"\ @@ -347,7 +347,7 @@ INNER_INIT_m4n16\ INNER_KERNELm4(16)\ INNER_SAVE_m4n16\ - "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ "subq $4,%8;"\ "32223:\n\t"\ @@ -355,7 +355,7 @@ INNER_INIT_m2n16\ INNER_KERNELm2(16)\ INNER_SAVE_m2n16\ - "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\ "subq $2,%8;"\ "32224:\n\t"\ @@ -363,28 +363,26 @@ INNER_INIT_m1n16\ INNER_KERNELm1(16)\ INNER_SAVE_m1n16\ - "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\ "32225:\n\t"\ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ - :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ + "leaq (%1,%%r12,2),%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ - "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r13","r14");\ + "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } #define COMPUTE_n24 {\ __asm__ __volatile__(\ - "movq %8,%%r14;movq %9,%%r15;movq %2,%%r13;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 22222f;"\ "22221:\n\t"\ INNER_INIT_m8n24\ - "prefetcht2 (%%r15); prefetcht2 64(%%r15);"\ INNER_KERNELm8(24)\ - "prefetcht2 128(%%r15); prefetcht2 192(%%r15);"\ INNER_SAVE_m8n24\ - "prefetcht2 256(%%r15); prefetcht2 320(%%r15); addq $384,%%r15;"\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ "22222:\n\t"\ @@ -392,7 +390,7 @@ INNER_INIT_m4n24\ INNER_KERNELm4(24)\ INNER_SAVE_m4n24\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ "subq $4,%8;"\ "22223:\n\t"\ @@ -400,7 +398,7 @@ INNER_INIT_m2n24\ INNER_KERNELm2(24)\ INNER_SAVE_m2n24\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\ "subq $2,%8;"\ "22224:\n\t"\ @@ -408,19 +406,19 @@ 
INNER_INIT_m1n24\ INNER_KERNELm1(24)\ INNER_SAVE_m1n24\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\ "22225:\n\t"\ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ - :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),\ - "+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(packed_b_pointer)\ + "leaq (%1,%%r12,2),%1; addq %%r12,%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ - "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r13","r14","r15");\ + "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } -static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8 +static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8 //perform C += A B if(k==0 || m==0 || ndiv8==0) return; int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); @@ -429,38 +427,17 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG double *c_pointer = c; __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; BLASLONG ndiv8_count; - double *b_scratch; - posix_memalign(&b_scratch,64,192*k); double *packed_b_pointer = packed_b; a_block_pointer = packed_a; for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ - __asm__ __volatile__ ( - "testq %2,%2; jz 100002f;movq %2,%%r13;shlq $6,%%r13;" - "100001:\n\t" - "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; vmovupd (%0,%%r13,2),%%zmm7; addq $64,%0;" - "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); vmovupd %%zmm7,128(%1); addq $192,%1;" - "decq %2; testq %2,%2; jnz 100001b;" - "100002:\n\t" - "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,2),%0;subq %%r13,%1;subq %%r13,%1;subq %%r13,%1;" - :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6","zmm7"); COMPUTE_n24 } for(;ndiv8_count>1;ndiv8_count-=2){ - __asm__ __volatile__ ( - "testq %2,%2; jz 1000002f;movq %2,%%r13;shlq $6,%%r13;" - "1000001:\n\t" - "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; addq $64,%0;" - "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); addq $128,%1;" - "decq %2; testq %2,%2; jnz 1000001b;" - "1000002:\n\t" - "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,1),%0;subq %%r13,%1;subq %%r13,%1;" - :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6"); COMPUTE_n16 } if(ndiv8_count>0){ COMPUTE_n8 } - free(b_scratch);b_scratch=NULL; } /* __m512d accumulators: zc1-zc4; temporary variables: za1,zb1-zb2 */ From 6bcb06fcb1d3f2b79fbe67c79db0e0af1d76297c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 18 Oct 2019 10:47:31 +0800 Subject: [PATCH 6/9] make further changes to icopy_8 easier --- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 103 +++++++++++++--------- 1 file changed, 61 insertions(+), 42 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c index b8b3234d1..49facd751 
100644 --- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -1,8 +1,8 @@ #include "common.h" #include #include +//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. /* row-major c_block */ -/* 64-bit pointer registers: a_block_pointer,b_block_pointer,c_pointer;*/ #define INNER_KERNEL_k1m1n8 \ "prefetcht0 384(%1);"\ "prefetcht0 768(%0); vmovupd (%1),%%zmm5; addq $64,%1;"\ @@ -158,7 +158,7 @@ #define INNER_STORE_m1n8(c1,disp) \ "kxnorw %%k1,%%k1,%%k1;"\ "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\ - "vaddpd %%zmm7,"#c1","#c1";"\ + "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ "kxnorw %%k1,%%k1,%%k1;"\ "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};" @@ -227,26 +227,27 @@ "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\ "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};" +//%7 for k01(input) only when m=4 #define INNER_STORE_4x8(c1,c2,c3,c4) \ - "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vaddpd %%zmm4,"#c1","#c1";"\ + "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ - "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vaddpd %%zmm5,"#c2","#c2";"\ + "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ - "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vaddpd %%zmm6,"#c3","#c3";"\ + "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ - "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vaddpd %%zmm7,"#c4","#c4";"\ + "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ "leaq (%3,%4,4),%3;" #define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vaddpd (%3),"#c1","#c1"; vmovupd "#c1",(%3); vaddpd (%3,%4,1),"#c2","#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vaddpd (%3),"#c3","#c3"; vmovupd "#c3",(%3); vaddpd (%3,%4,1),"#c4","#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\ "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vaddpd (%3),"#c5","#c5"; vmovupd "#c5",(%3); vaddpd (%3,%4,1),"#c6","#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\ "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vaddpd (%3),"#c7","#c7"; vmovupd "#c7",(%3); vaddpd (%3,%4,1),"#c8","#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;" + "vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;" #define INNER_SAVE_m4n8 \ INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ @@ -292,6 +293,7 @@ #define COMPUTE_n8 {\ __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ "movq 
%8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 42222f;"\ "42221:\n\t"\ @@ -327,12 +329,13 @@ "42225:\n\t"\ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ - ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\ a_block_pointer -= M * K;\ } #define COMPUTE_n16 {\ __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 32222f;"\ "32221:\n\t"\ @@ -369,13 +372,14 @@ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ "leaq (%1,%%r12,2),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ - ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } #define COMPUTE_n24 {\ __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 22222f;"\ "22221:\n\t"\ @@ -412,13 +416,13 @@ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ "leaq (%1,%%r12,2),%1; addq %%r12,%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ - ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } -static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8 +static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 //perform C += A B if(k==0 || m==0 || ndiv8==0) return; int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); @@ -426,7 +430,7 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac double *a_block_pointer; double *c_pointer = c; __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; - BLASLONG ndiv8_count; + BLASLONG m_count,ndiv8_count,k_count; double *packed_b_pointer = packed_b; a_block_pointer = packed_a; 
for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ @@ -474,24 +478,27 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac #define INIT_m8n2 zc2=INIT_m8n1 #define INIT_m8n4 zc4=zc3=INIT_m8n2 #define SAVE_m8n1 {\ - za1 = _mm512_loadu_pd(c_pointer);\ - zc1 = _mm512_add_pd(zc1,za1);\ + __asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\ + zb1 = _mm512_loadu_pd(c_pointer);\ + zc1 = _mm512_fmadd_pd(zc1,za1,zb1);\ _mm512_storeu_pd(c_pointer,zc1);\ c_pointer += 8;\ } #define SAVE_m8n2 {\ + __asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\ zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ - zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\ + zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\ _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\ c_pointer += 8;\ } #define SAVE_m8n4 {\ + __asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\ zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ - zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\ + zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\ _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\ c_pointer += LDC*2;\ zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ - zc3 = _mm512_add_pd(zc3,zb1); zc4 = _mm512_add_pd(zc4,zb2);\ + zc3 = _mm512_fmadd_pd(zc3,za1,zb1); zc4 = _mm512_fmadd_pd(zc4,za1,zb2);\ _mm512_storeu_pd(c_pointer,zc3); _mm512_storeu_pd(c_pointer+LDC,zc4);\ c_pointer += 8-LDC*2;\ } @@ -518,24 +525,27 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac #define INIT_m4n2 yc2=INIT_m4n1 #define INIT_m4n4 yc4=yc3=INIT_m4n2 #define SAVE_m4n1 {\ + yb1 = _mm256_broadcast_sd(alpha);\ ya1 = _mm256_loadu_pd(c_pointer);\ - yc1 = _mm256_add_pd(yc1,ya1);\ + yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ _mm256_storeu_pd(c_pointer,yc1);\ c_pointer += 4;\ } #define SAVE_m4n2 {\ + ya1 = _mm256_broadcast_sd(alpha);\ yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ c_pointer += 4;\ } #define SAVE_m4n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ c_pointer += LDC*2;\ yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc3 = _mm256_add_pd(yc3,yb1); yc4 = _mm256_add_pd(yc4,yb2);\ + yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ c_pointer += 4-LDC*2;\ } @@ -553,14 +563,16 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac #define INIT_m2n1 xc1=_mm_setzero_pd(); #define INIT_m2n2 xc2=INIT_m2n1 #define SAVE_m2n1 {\ + xb1 = _mm_loaddup_pd(alpha);\ xa1 = _mm_loadu_pd(c_pointer);\ - xc1 = _mm_add_pd(xc1,xa1);\ + xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ _mm_storeu_pd(c_pointer,xc1);\ c_pointer += 2;\ } #define SAVE_m2n2 {\ + xa1 = _mm_loaddup_pd(alpha);\ xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ 
- xc1 = _mm_add_pd(xc1,xb1); xc2 = _mm_add_pd(xc2,xb2);\ + xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ c_pointer += 2;\ } @@ -571,7 +583,7 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac } #define INIT_m1n1 sc1=0.0; #define SAVE_m1n1 {\ - *c_pointer += sc1;\ + *c_pointer += sc1 * (*alpha);\ c_pointer++;\ } @@ -596,6 +608,9 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac #define INIT_m1n4 INIT_m4n1 #define INIT_m2n4 INIT_m4n2 #define SAVE_m2n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + yc2 = _mm256_mul_pd(yc2,ya1);\ yb1 = _mm256_unpacklo_pd(yc1,yc2);\ yb2 = _mm256_unpackhi_pd(yc1,yc2);\ xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ @@ -609,12 +624,16 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac c_pointer += 2;\ } #define SAVE_m1n2 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xc1 = _mm_mul_pd(xc1,xb1);\ *c_pointer += _mm_cvtsd_f64(xc1);\ xa1 = _mm_unpackhi_pd(xc1,xc1);\ c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ c_pointer ++;\ } #define SAVE_m1n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ xb1 = _mm256_extractf128_pd(yc1,0);\ *c_pointer += _mm_cvtsd_f64(xb1);\ xb2 = _mm_unpackhi_pd(xb1,xb1);\ @@ -626,7 +645,7 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac c_pointer ++;\ } -static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8 +static void __attribute__ ((noinline)) KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 //perform C += A B , edge_n<8 must be satisfied ! 
if(k==0 || m==0 || edge_n==0) return; double *a_block_pointer,*b_block_pointer,*b_base_pointer; @@ -724,30 +743,30 @@ static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG } } } -static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k,double alpha){ - BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp,alp; - src1 = src; dst1 = dst; src2 = src1 + 4 * k; alp = _mm256_set1_pd(alpha); +static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){ + BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp; + src1 = src; dst1 = dst; src2 = src1 + 4 * k; for(m_count=m;m_count>7;m_count-=8){ for(k_count=k;k_count>0;k_count--){ - tmp = _mm256_loadu_pd(src1);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+0,tmp);src1+=4; - tmp = _mm256_loadu_pd(src2);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+4,tmp);src2+=4; + tmp = _mm256_loadu_pd(src1);_mm256_storeu_pd(dst1+0,tmp);src1+=4; + tmp = _mm256_loadu_pd(src2);_mm256_storeu_pd(dst1+4,tmp);src2+=4; dst1+=8; } src1+=4*k;src2+=4*k; } for(;m_count>0;m_count--){ for(k_count=k;k_count>0;k_count--){ - *dst1 = (*src1) * alpha; src1++; dst1++; + *dst1 = (*src1); src1++; dst1++; } } } int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){ - if(m==0 || n==0 || k==0) return 0; - BLASLONG ndiv8 = n/8; + if(m==0 || n==0 || k==0 || alpha == 0.0) return 0; + BLASLONG ndiv8 = n/8;double ALPHA = alpha; double *packed_a = (double *)malloc(m*k*sizeof(double)); - copy_4_to_8(A,packed_a,m,k,alpha); - if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C); - if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8); + copy_4_to_8(A,packed_a,m,k); + if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA); + if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA); free(packed_a);packed_a=NULL; return 0; } From 17cdd9f9e17728e4d8a044aa23028aa40b5e9946 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 18 Oct 2019 14:58:07 +0800 Subject: [PATCH 7/9] some correction --- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c index 49facd751..bfd63bbc7 100644 --- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -330,7 +330,7 @@ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } #define COMPUTE_n16 {\ @@ -645,8 +645,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG c_pointer ++;\ } -static void __attribute__ ((noinline)) KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 -//perform C += A B , edge_n<8 must be satisfied ! 
+static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 +//perform C += A B , edge_n<8 must be satisfied. if(k==0 || m==0 || edge_n==0) return; double *a_block_pointer,*b_block_pointer,*b_base_pointer; double *c_pointer = c; @@ -763,11 +763,16 @@ static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){ int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){ if(m==0 || n==0 || k==0 || alpha == 0.0) return 0; BLASLONG ndiv8 = n/8;double ALPHA = alpha; +#ifdef ICOPY_4 double *packed_a = (double *)malloc(m*k*sizeof(double)); copy_4_to_8(A,packed_a,m,k); +#else //ICOPY_8 + double *packed_a = A; +#endif if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA); if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA); +#ifdef ICOPY_4 free(packed_a);packed_a=NULL; +#endif return 0; } - From 0d669e04bb2716b6a7767f0f449a2b09caf2d456 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 18 Oct 2019 15:00:17 +0800 Subject: [PATCH 8/9] Update dgemm_kernel_8x8_skylakex.c --- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c index bfd63bbc7..1139090e2 100644 --- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -1,6 +1,8 @@ #include "common.h" #include #include + +#define ICOPY_4 //register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. /* row-major c_block */ #define INNER_KERNEL_k1m1n8 \ @@ -743,6 +745,7 @@ static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG } } } +#ifdef ICOPY_4 static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){ BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp; src1 = src; dst1 = dst; src2 = src1 + 4 * k; @@ -760,6 +763,7 @@ static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){ } } } +#endif int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){ if(m==0 || n==0 || k==0 || alpha == 0.0) return 0; BLASLONG ndiv8 = n/8;double ALPHA = alpha; From 6ff013bae0965d791841aead9c9dcb0e27e1b7be Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 19 Oct 2019 03:54:44 +0800 Subject: [PATCH 9/9] native support for icopy_4 90% MKL 1-thread performance. 
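The kernel introduced below assumes the icopy_4 packing of A: rows 0-3 of each 8-row block sit at the current A pointer and rows 4-7 one 4-wide panel (4*k doubles) further on, which is the offset the inline assembly reaches through %%r12 (set to 32*K bytes in the COMPUTE_* macros). A minimal intrinsics sketch of one such k-loop under that assumed layout; the names dgemm_tile_8x8_step, a_panel and b_panel are illustrative and not taken from the patch:

#include <immintrin.h>

/* One m=8, n=8 tile: accumulate k rank-1 updates from a pair of 4-wide-packed A panels
   and an 8-wide-packed B panel, mirroring what INNER_KERNEL_k1m8n8 does per k step. */
static inline void dgemm_tile_8x8_step(const double *a_panel, const double *b_panel,
                                       long k, __m512d acc[8])
{
    const double *a_lo = a_panel;          /* rows 0-3, 4 doubles per k step     */
    const double *a_hi = a_panel + 4 * k;  /* rows 4-7, the next 4-wide A panel  */
    for (long p = 0; p < k; p++) {
        __m512d b = _mm512_loadu_pd(b_panel + 8 * p);   /* one 8-wide row of packed B */
        for (int i = 0; i < 4; i++) {
            acc[i]     = _mm512_fmadd_pd(_mm512_set1_pd(a_lo[4 * p + i]), b, acc[i]);
            acc[4 + i] = _mm512_fmadd_pd(_mm512_set1_pd(a_hi[4 * p + i]), b, acc[4 + i]);
        }
    }
}

Reading the second panel through a fixed 4*k stride is what lets this patch consume the 4-wide copy output directly, without the copy_4_to_8 repack and malloc used by the previous kernel's CNAME.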
--- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c | 666 ++++++++++++++++++++ 2 files changed, 667 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index d73a47925..82a455b44 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -7,7 +7,7 @@ SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMKERNEL = dgemm_kernel_8x8_skylakex.c +DGEMMKERNEL = dgemm_kernel_4x8_skylakex_2.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c new file mode 100644 index 000000000..a958a1a6f --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c @@ -0,0 +1,666 @@ +#include "common.h" +#include +#include + +//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. +/* row-major c_block */ +#define INNER_KERNEL_k1m1n8 \ + "prefetcht0 384(%1);"\ + "vmovupd (%1),%%zmm5; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" + +#define INNER_KERNEL_k1m2n8 \ + INNER_KERNEL_k1m1n8\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m4n8 \ + INNER_KERNEL_k1m2n8\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m8n8 \ + INNER_KERNEL_k1m4n8\ + "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\ + "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\ + "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\ + "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;" + +#define INNER_KERNEL_k1m1n16 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m2n16 \ + INNER_KERNEL_k1m1n16\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m4n16 \ + INNER_KERNEL_k1m2n16\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;" + +#define INNER_KERNEL_k1m8n16 \ + INNER_KERNEL_k1m4n16\ + "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\ + "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\ + "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\ + "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;" + +#define INNER_KERNEL_k1m1n24 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" + +#define INNER_KERNEL_k1m2n24 \ + INNER_KERNEL_k1m1n24\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd 
%%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" + +#define INNER_KERNEL_k1m4n24 \ + INNER_KERNEL_k1m2n24\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;" + +#define INNER_KERNEL_k1m8n24 \ + INNER_KERNEL_k1m4n24\ + "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\ + "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\ + "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\ + "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;" + +#define INNER_KERNELm1(nn) \ + "cmpq $1,%2;jb "#nn"3f;"\ + #nn"4:\n\t"\ + INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ + #nn"3:\n\t" + +#define INNER_KERNELm2(nn) \ + "cmpq $1,%2;jb "#nn"0f;"\ + #nn"1:\n\t"\ + INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ + #nn"0:\n\t" + +#define INNER_KERNELm4(nn) \ + "cmpq $1,%2;jb "#nn"00f;"\ + #nn"01:\n\t"\ + INNER_KERNEL_k1m4n##nn "addq $32,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ + #nn"00:\n\t" + +#define INNER_KERNELm8(nn) \ + "cmpq $8,%2;jb "#nn"001f;"\ + #nn"008:\n\t"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + "subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\ + #nn"001:\n\t"\ + "cmpq $1,%2;jb "#nn"000f;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + "decq %2;jmp "#nn"001b;"\ + ""#nn"000:\n\t" + +#define INNER_INIT_m1n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8;" + +#define INNER_INIT_m2n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" + +#define INNER_INIT_m4n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" + +#define INNER_INIT_m8n8 \ + INNER_INIT_m4n8\ + "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" + +#define INNER_INIT_m1n16 INNER_INIT_m2n8 + +#define INNER_INIT_m2n16 INNER_INIT_m4n8 + +#define INNER_INIT_m4n16 INNER_INIT_m8n8 + +#define INNER_INIT_m8n16 \ + INNER_INIT_m8n8\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ + "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" + +#define INNER_INIT_m1n24 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;" + +#define INNER_INIT_m2n24 \ + INNER_INIT_m1n24\ + "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" + +#define INNER_INIT_m4n24 \ + INNER_INIT_m4n16\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" + +#define INNER_INIT_m8n24 \ + INNER_INIT_m8n16\ + "vpxorq 
%%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ + "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" + +#define INNER_SETINDEX \ + "vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ + "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" + +#define INNER_STORE_m1n8(c1,disp) \ + "kxnorw %%k1,%%k1,%%k1;"\ + "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\ + "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ + "kxnorw %%k1,%%k1,%%k1;"\ + "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};" + +#define INNER_SAVE_m1n8 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0) + +#define INNER_SAVE_m1n16 \ + INNER_SAVE_m1n8\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0) + +#define INNER_SAVE_m1n24 \ + INNER_SAVE_m1n16\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm10,0) + +#define INNER_SAVE_m2n8 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm9,8) + +#define INNER_SAVE_m2n16 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm10,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm11,8) +#define INNER_SAVE_m2n24 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm11,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm12,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm10,0)\ + INNER_STORE_m1n8(%%zmm13,8) +#define INNER_PREF_8x8 \ + "prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\ + "prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\ + "prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\ + "prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\ + "subq %4,%3; subq %4,%3; subq %4,%3;" +#define INNER_TRANS_4x8(c1,c2,c3,c4) \ + "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\ + "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\ + "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\ + "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\ + "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\ + "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";" +#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + INNER_TRANS_4x8(c1,c2,c3,c4)\ + INNER_TRANS_4x8(c5,c6,c7,c8)\ + "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\ + "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\ + "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\ + "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\ + "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\ + "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\ + "vblendmpd 
"#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\ + "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};" +//%7 for k01(input) only when m=4 +#define INNER_STORE_4x8(c1,c2,c3,c4) \ + "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ + "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ + "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ + "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ + "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "leaq (%3,%4,4),%3;" +#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;" +#define INNER_SAVE_m4n8 \ + INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ + INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) +#define INNER_SAVE_m4n16 \ + INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ + INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ + INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\ + INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15) +#define INNER_SAVE_m4n24 \ + INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ + INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ + INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ + INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ + INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\ + INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19) +#define INNER_SAVE_m8n8 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) +#define INNER_SAVE_m8n16 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ + INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\ + INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23) +#define INNER_SAVE_m8n24 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ + INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ + INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\ 
+ INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31) + +#define COMPUTE_n8 {\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 42222f;"\ + "42221:\n\t"\ + INNER_INIT_m8n8\ + INNER_KERNELm8(8)\ + INNER_SAVE_m8n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ + "42222:\n\t"\ + "cmpq $4,%8; jb 42223f;"\ + INNER_INIT_m4n8\ + INNER_KERNELm4(8)\ + INNER_SAVE_m4n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "42223:\n\t"\ + "cmpq $2,%8; jb 42224f;"\ + INNER_INIT_m2n8\ + INNER_KERNELm2(8)\ + INNER_SAVE_m2n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "42224:\n\t"\ + "cmpq $1,%8; jb 42225f;"\ + INNER_INIT_m1n8\ + INNER_KERNELm1(8)\ + INNER_SAVE_m1n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "42225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n16 {\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 32222f;"\ + "32221:\n\t"\ + INNER_INIT_m8n16\ + INNER_KERNELm8(16)\ + INNER_SAVE_m8n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ + "32222:\n\t"\ + "cmpq $4,%8; jb 32223f;"\ + INNER_INIT_m4n16\ + INNER_KERNELm4(16)\ + INNER_SAVE_m4n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "32223:\n\t"\ + "cmpq $2,%8; jb 32224f;"\ + INNER_INIT_m2n16\ + INNER_KERNELm2(16)\ + INNER_SAVE_m2n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\ + "subq $2,%8;"\ + "32224:\n\t"\ + "cmpq $1,%8; jb 32225f;"\ + INNER_INIT_m1n16\ + INNER_KERNELm1(16)\ + INNER_SAVE_m1n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\ + "32225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ + "leaq (%1,%%r12,4),%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n24 {\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 22222f;"\ + "22221:\n\t"\ + INNER_INIT_m8n24\ + INNER_KERNELm8(24)\ + INNER_SAVE_m8n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ + "22222:\n\t"\ + "cmpq $4,%8; jb 22223f;"\ + 
INNER_INIT_m4n24\ + INNER_KERNELm4(24)\ + INNER_SAVE_m4n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "22223:\n\t"\ + "cmpq $2,%8; jb 22224f;"\ + INNER_INIT_m2n24\ + INNER_KERNELm2(24)\ + INNER_SAVE_m2n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\ + "subq $2,%8;"\ + "22224:\n\t"\ + "cmpq $1,%8; jb 22225f;"\ + INNER_INIT_m1n24\ + INNER_KERNELm1(24)\ + INNER_SAVE_m1n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\ + "22225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ + "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ + "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 +//perform C += A B + if(k==0 || m==0 || ndiv8==0) return; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); + int64_t K = (int64_t)k; int64_t M = (int64_t)m; + double *a_block_pointer; + double *c_pointer = c; + __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; + BLASLONG ndiv8_count; + double *packed_b_pointer = packed_b; + a_block_pointer = packed_a; + for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ + COMPUTE_n24 + } + for(;ndiv8_count>1;ndiv8_count-=2){ + COMPUTE_n16 + } + if(ndiv8_count>0){ + COMPUTE_n8 + } +} + +/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ +/* __m128d accumulators: xc1-xc2; temporary variables: xa1,xb1-xb2 */ +/* double accumulator: sc1; temporary variables: sa1,sb1 */ +/* column-major c_block */ +#define KERNEL_m4n4k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ + b_block_pointer+=4;\ +} +#define KERNEL_m4n2k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + b_block_pointer+=2;\ +} +#define KERNEL_m4n1k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + b_block_pointer++;\ +} +#define INIT_m4n1 yc1=_mm256_setzero_pd(); +#define INIT_m4n2 yc2=INIT_m4n1 +#define INIT_m4n4 yc4=yc3=INIT_m4n2 +#define SAVE_m4n1 {\ + yb1 = _mm256_broadcast_sd(alpha);\ + ya1 = _mm256_loadu_pd(c_pointer);\ + yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ + _mm256_storeu_pd(c_pointer,yc1);\ + c_pointer += 4;\ +} +#define SAVE_m4n2 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + 
yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += 4;\ +} +#define SAVE_m4n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += LDC*2;\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ + c_pointer += 4-LDC*2;\ +} +#define KERNEL_m2n2k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ + b_block_pointer += 2;\ +} +#define KERNEL_m2n1k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + b_block_pointer ++;\ +} +#define INIT_m2n1 xc1=_mm_setzero_pd(); +#define INIT_m2n2 xc2=INIT_m2n1 +#define SAVE_m2n1 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xa1 = _mm_loadu_pd(c_pointer);\ + xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ + _mm_storeu_pd(c_pointer,xc1);\ + c_pointer += 2;\ +} +#define SAVE_m2n2 {\ + xa1 = _mm_loaddup_pd(alpha);\ + xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ + xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ + _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ + c_pointer += 2;\ +} +#define KERNEL_m1n1k1 {\ + sa1 = *a_block_pointer; a_block_pointer++;\ + sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ + b_block_pointer ++;\ +} +#define INIT_m1n1 sc1=0.0; +#define SAVE_m1n1 {\ + *c_pointer += sc1 * (*alpha);\ + c_pointer++;\ +} +/* row-major c_block */ +#define KERNEL_m2n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ + a_block_pointer += 2;\ +} +#define KERNEL_m1n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + a_block_pointer ++;\ +} +#define KERNEL_m1n2k1 {\ + xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ + xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + a_block_pointer ++;\ +} +#define INIT_m1n2 INIT_m2n1 +#define INIT_m1n4 INIT_m4n1 +#define INIT_m2n4 INIT_m4n2 +#define SAVE_m2n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + yc2 = _mm256_mul_pd(yc2,ya1);\ + yb1 = _mm256_unpacklo_pd(yc1,yc2);\ + yb2 = _mm256_unpackhi_pd(yc1,yc2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\ + _mm_storeu_pd(c_pointer,xb1);\ + _mm_storeu_pd(c_pointer+LDC,xb2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\ + _mm_storeu_pd(c_pointer+2*LDC,xb1);\ + _mm_storeu_pd(c_pointer+3*LDC,xb2);\ + c_pointer += 2;\ +} +#define SAVE_m1n2 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xc1 = _mm_mul_pd(xc1,xb1);\ + *c_pointer += _mm_cvtsd_f64(xc1);\ + xa1 = 
_mm_unpackhi_pd(xc1,xc1);\ + c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ + c_pointer ++;\ +} +#define SAVE_m1n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + xb1 = _mm256_extractf128_pd(yc1,0);\ + *c_pointer += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ + xb1 = _mm256_extractf128_pd(yc1,1);\ + c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\ + c_pointer ++;\ +} +static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 +//perform C += A B , edge_n<8 must be satisfied. + if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return; + double *a_block_pointer,*b_block_pointer,*b_base_pointer; + double *c_pointer = c; + __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2; + __m128d xc1,xc2,xa1,xb1,xb2; + double sc1,sa1,sb1; + BLASLONG m_count,n_count,k_count; + b_base_pointer = packed_b; +//now start calculation of the edge part + for(n_count=edge_n;n_count>3;n_count-=4){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n4 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n4 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n4 + for(k_count=0;k_count1;n_count-=2){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n2 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n2 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n2 + for(k_count=0;k_count0){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n1 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n1 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n1 + for(k_count=0;k_count0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA); + if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA); + return 0; +}
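Across this series the alpha handling moves out of the A-packing copy and into the C-store stage: each SAVE path folds alpha in at the store (a broadcast plus FMA against the loaded C tile for the wide paths, a scalar multiply for the m=1 cases), so the kernels compute C := alpha*(A*B) + C and CNAME returns early when alpha == 0. As a correctness oracle for that end-to-end update (not a drop-in for any macro, since the kernels only see packed panels), here is a plain column-major scalar reference; the function name and the lda/ldb parameters are illustrative assumptions, not part of the patch:

/* Reference semantics only: C (column-major, leading dimension ldc) += alpha * A*B,
   with A read as m x k (leading dimension lda) and B as k x n (leading dimension ldb). */
static void dgemm_reference(long m, long n, long k, double alpha,
                            const double *A, long lda,
                            const double *B, long ldb,
                            double *C, long ldc)
{
    for (long j = 0; j < n; j++)
        for (long i = 0; i < m; i++) {
            double s = 0.0;
            for (long p = 0; p < k; p++)
                s += A[i + p * lda] * B[p + j * ldb];
            C[i + j * ldc] += alpha * s;
        }
}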