diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
index 72878acfd..51b0b94fa 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
+++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
@@ -2,7 +2,8 @@
 #include "common.h"
 #include <stdint.h>
 
-//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
+//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
+
 /* row-major c_block */
 #define INNER_KERNEL_k1m1n8 \
     "prefetcht0 384(%1);"\
@@ -13,18 +14,6 @@
     INNER_KERNEL_k1m1n8\
     "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
 
-#define INNER_KERNEL_k1m4n8 \
-    INNER_KERNEL_k1m2n8\
-    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
-    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
-
-#define INNER_KERNEL_k1m8n8 \
-    INNER_KERNEL_k1m4n8\
-    "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
-    "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
-    "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
-    "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
-
 #define INNER_KERNEL_k1m1n16 \
     "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\
     "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\
@@ -34,18 +23,6 @@
     INNER_KERNEL_k1m1n16\
     "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
 
-#define INNER_KERNEL_k1m4n16 \
-    INNER_KERNEL_k1m2n16\
-    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
-    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
-
-#define INNER_KERNEL_k1m8n16 \
-    INNER_KERNEL_k1m4n16\
-    "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
-    "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
-    "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
-    "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
-
 #define INNER_KERNEL_k1m1n24 \
     "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\
     "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\
@@ -55,18 +32,48 @@
     INNER_KERNEL_k1m1n24\
     "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
 
+/* row-major z-partition c_block */
+#define INNER_KERNEL_k1m4n8 \
+    "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\
+    "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\
+    "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;"
+
+#define INNER_KERNEL_k1m4n16 \
+    INNER_KERNEL_k1m4n8\
+    "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\
+    "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;"
+
 #define INNER_KERNEL_k1m4n24 \
-    INNER_KERNEL_k1m2n24\
-    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
-    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
+    INNER_KERNEL_k1m4n16\
+    "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\
+    "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;"
+
+#define INNER_KERNEL_k1m8n8 \
+    "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\
+    "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\
+    "prefetcht0 128(%1);"\
+    "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\
+    "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\
+    "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\
+    "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;"
+
+#define INNER_KERNEL_k1m8n16 \
+    INNER_KERNEL_k1m8n8\
+    "prefetcht0 128(%1,%%r12,2);"\
+    "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\
+    "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\
+    "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\
+    "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;"
 
 #define INNER_KERNEL_k1m8n24 \
-    INNER_KERNEL_k1m4n24\
-    "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
-    "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
-    "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
-    "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
+    INNER_KERNEL_k1m8n16\
+    "prefetcht0 128(%1,%%r12,4);"\
+    "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\
+    "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\
+    "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\
+    "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;"
 
+/* micro kernels */
 #define INNER_KERNELm1(nn) \
     "cmpq $1,%2;jb "#nn"3f;"\
     #nn"4:\n\t"\
@@ -84,7 +91,7 @@
 #define INNER_KERNELm4(nn) \
     "cmpq $1,%2;jb "#nn"00f;"\
     #nn"01:\n\t"\
-    INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m4n##nn "addq $64,%1;"\
     "decq %2;cmpq $1,%2;jnb "#nn"01b;"\
     #nn"00:\n\t"
 
@@ -92,18 +99,18 @@
 #define INNER_KERNELm8(nn) \
     "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\
     #nn"008:\n\t"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
     "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
     "prefetcht1 (%11); addq $16,%11;"\
     "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
     "movq %3,%10;"\
     #nn"001:\n\t"\
     "cmpq $1,%2;jb "#nn"000f;"\
     "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
     "decq %2;jmp "#nn"001b;"\
     ""#nn"000:\n\t"
 
@@ -207,24 +214,19 @@
     INNER_STORE_m1n8(%%zmm13,8)
 
 #define INNER_TRANS_4x8(c1,c2,c3,c4) \
-    "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
-    "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
-    "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
-    "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
-    "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
-    "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
+    "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\
+    "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\
+    "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\
+    "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\
+
+#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \
+    "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\
+    "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\
+    "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\
+    "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};"
 
 #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
-    INNER_TRANS_4x8(c1,c2,c3,c4)\
-    INNER_TRANS_4x8(c5,c6,c7,c8)\
-    "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
-    "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\
-    "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
-    "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\
-    "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
-    "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
-    "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
-    "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
+    INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8)
 
 //%7 for k01(input) only when m=4
 #define INNER_STORE_4x8(c1,c2,c3,c4) \
@@ -250,20 +252,14 @@
     INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
 
 #define INNER_SAVE_m4n16 \
-    "movq %3,%10;"\
-    INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
-    INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
-    INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
-    INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
+    INNER_SAVE_m4n8\
+    INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
+    INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
 
 #define INNER_SAVE_m4n24 \
-    "movq %3,%10;"\
-    INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
-    INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
-    INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
-    INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
-    INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
-    INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
+    INNER_SAVE_m4n16\
+    INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\
+    INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
 
 #define INNER_SAVE_m8n8 \
     "movq %3,%10;"\
@@ -271,20 +267,14 @@
     INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
 
 #define INNER_SAVE_m8n16 \
-    "movq %3,%10;"\
-    INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
-    INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
-    INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
-    INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
+    INNER_SAVE_m8n8\
+    INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\
+    INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)
 
 #define INNER_SAVE_m8n24 \
-    "movq %3,%10;"\
-    INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
-    INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
-    INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
-    INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
-    INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
-    INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
+    INNER_SAVE_m8n16\
+    INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\
+    INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)
 
 #define COMPUTE_n8 {\
     b_pref = packed_b_pointer + 8 * K;\
@@ -327,7 +317,7 @@
     "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
     :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
     "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
-    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
+    ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 #define COMPUTE_n16 {\
@@ -372,7 +362,7 @@
     "leaq (%1,%%r12,4),%1;"\
     :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
     "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
-    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
+    ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
     "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
@@ -417,9 +407,9 @@
     "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
     "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
     :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
-    "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
-    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
-    "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
+    "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\
+    "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\
+    "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8
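Note on the new scheme (not part of the patch): the z-partitioned kernels replace the old per-element vbroadcastsd/FMA chains with 128-bit broadcasts of A pairs (vbroadcastf32x4) and in-lane duplication of B (vmovddup). The snippet below is only an illustrative AVX-512 intrinsics sketch of one k iteration of the new INNER_KERNEL_k1m4n8; the function and variable names are made up, and it assumes the packed layouts the kernel already relies on (4 A doubles and 8 B doubles consumed per k step).

/*
 * Illustrative sketch only -- not the kernel itself.  One k iteration of the
 * z-partitioned m4n8 micro-kernel in AVX-512 intrinsics.  Requires AVX512DQ
 * (e.g. build with -march=skylake-avx512).  Names here are hypothetical.
 */
#include <immintrin.h>

static inline void zpart_k1m4n8_step(const double *a,   /* 4 packed A values   */
                                     const double *b,   /* 8 packed B values   */
                                     __m512d acc[4])    /* plays zmm8..zmm11   */
{
    /* vbroadcastf32x4: copy an A pair into every 128-bit lane */
    __m512d a01 = _mm512_broadcast_f64x2(_mm_loadu_pd(a));      /* a0,a1,a0,a1,... */
    __m512d a23 = _mm512_broadcast_f64x2(_mm_loadu_pd(a + 2));  /* a2,a3,a2,a3,... */

    /* vmovddup (%1) duplicates the even B elements in-lane; the odd duplicates
       (vmovddup 8(%1) in the asm) are derived here from the same load instead
       of a second memory read. */
    __m512d bv  = _mm512_loadu_pd(b);             /* b0..b7                  */
    __m512d bev = _mm512_movedup_pd(bv);          /* b0,b0,b2,b2,b4,b4,b6,b6 */
    __m512d bod = _mm512_permute_pd(bv, 0xff);    /* b1,b1,b3,b3,b5,b5,b7,b7 */

    /* four FMAs update the 4x8 C tile in pair-interleaved ("z") order */
    acc[0] = _mm512_fmadd_pd(a01, bev, acc[0]);   /* rows 0-1, even columns */
    acc[1] = _mm512_fmadd_pd(a01, bod, acc[1]);   /* rows 0-1, odd  columns */
    acc[2] = _mm512_fmadd_pd(a23, bev, acc[2]);   /* rows 2-3, even columns */
    acc[3] = _mm512_fmadd_pd(a23, bod, acc[3]);   /* rows 2-3, odd  columns */
}

Because the products accumulate in this interleaved order, the save path has to reshuffle the registers before writing C back, which is what the rewritten INNER_TRANS_4x8 and the new INNER_TRANS_f128_4x4 macros do ahead of the INNER_STORE_4x8/INNER_STORE_8x8 stores.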