From b7315f8401089a91ae382b87be7e2683745828da Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Wed, 16 Oct 2019 19:23:36 +0800
Subject: [PATCH] Add files via upload

---
 kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 75 ++++++++---------------
 1 file changed, 26 insertions(+), 49 deletions(-)

diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
index 1db955776..b8b3234d1 100644
--- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
@@ -25,8 +25,8 @@
     "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
 
 #define INNER_KERNEL_k1m1n16 \
-    "prefetcht0 384(%1); prefetcht0 448(%1);"\
-    "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; addq $128,%1;"\
+    "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1);"\
+    "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; addq $64,%1;"\
     "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;"
 
 #define INNER_KERNEL_k1m2n16 \
@@ -46,8 +46,8 @@
     "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
 
 #define INNER_KERNEL_k1m1n24 \
-    "prefetcht0 384(%1); prefetcht0 448(%1); prefetcht0 512(%1);"\
-    "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; vmovupd 128(%1),%%zmm7; addq $192,%1;"\
+    "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1); prefetcht0 128(%1,%%r12,2);"\
+    "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; vmovupd (%1,%%r12,2),%%zmm7; addq $64,%1;"\
     "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;"
 
 #define INNER_KERNEL_k1m2n24 \
@@ -292,13 +292,13 @@
 
 #define COMPUTE_n8 {\
     __asm__ __volatile__(\
-    "movq %8,%%r14;movq %2,%%r13;"\
+    "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
     "cmpq $8,%8; jb 42222f;"\
     "42221:\n\t"\
     INNER_INIT_m8n8\
    INNER_KERNELm8(8)\
     INNER_SAVE_m8n8\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 42221b;"\
     "42222:\n\t"\
@@ -306,7 +306,7 @@
     INNER_INIT_m4n8\
     INNER_KERNELm4(8)\
     INNER_SAVE_m4n8\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\
     "subq $4,%8;"\
     "42223:\n\t"\
@@ -314,7 +314,7 @@
     INNER_INIT_m2n8\
     INNER_KERNELm2(8)\
     INNER_SAVE_m2n8\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "addq $16,%3;"\
     "subq $2,%8;"\
     "42224:\n\t"\
@@ -322,7 +322,7 @@
     INNER_INIT_m1n8\
     INNER_KERNELm1(8)\
     INNER_SAVE_m1n8\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "addq $8,%3;"\
     "42225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
@@ -333,13 +333,13 @@
 }
 #define COMPUTE_n16 {\
     __asm__ __volatile__(\
-    "movq %8,%%r14;movq %2,%%r13;"\
+    "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
     "cmpq $8,%8; jb 32222f;"\
     "32221:\n\t"\
     INNER_INIT_m8n16\
     INNER_KERNELm8(16)\
     INNER_SAVE_m8n16\
-    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 32221b;"\
     "32222:\n\t"\
@@ -347,7 +347,7 @@
     INNER_INIT_m4n16\
     INNER_KERNELm4(16)\
     INNER_SAVE_m4n16\
-    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
     "subq $4,%8;"\
     "32223:\n\t"\
@@ -355,7 +355,7 @@
     INNER_INIT_m2n16\
     INNER_KERNELm2(16)\
     INNER_SAVE_m2n16\
-    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\
     "subq $2,%8;"\
     "32224:\n\t"\
@@ -363,28 +363,26 @@
     INNER_INIT_m1n16\
     INNER_KERNELm1(16)\
     INNER_SAVE_m1n16\
-    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\
     "32225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $4,%4;addq %4,%3;shrq $4,%4;"\
-    :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
+    "leaq (%1,%%r12,2),%1;"\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
     ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
-    "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r13","r14");\
+    "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 #define COMPUTE_n24 {\
     __asm__ __volatile__(\
-    "movq %8,%%r14;movq %9,%%r15;movq %2,%%r13;"\
+    "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
     "cmpq $8,%8; jb 22222f;"\
     "22221:\n\t"\
     INNER_INIT_m8n24\
-    "prefetcht2 (%%r15); prefetcht2 64(%%r15);"\
     INNER_KERNELm8(24)\
-    "prefetcht2 128(%%r15); prefetcht2 192(%%r15);"\
     INNER_SAVE_m8n24\
-    "prefetcht2 256(%%r15); prefetcht2 320(%%r15); addq $384,%%r15;"\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 22221b;"\
     "22222:\n\t"\
@@ -392,7 +390,7 @@
     INNER_INIT_m4n24\
     INNER_KERNELm4(24)\
     INNER_SAVE_m4n24\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
     "subq $4,%8;"\
     "22223:\n\t"\
@@ -400,7 +398,7 @@
     INNER_INIT_m2n24\
     INNER_KERNELm2(24)\
     INNER_SAVE_m2n24\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\
     "subq $2,%8;"\
     "22224:\n\t"\
@@ -408,19 +406,19 @@
     INNER_INIT_m1n24\
     INNER_KERNELm1(24)\
     INNER_SAVE_m1n24\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\
     "22225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
-    :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),\
-    "+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(packed_b_pointer)\
+    "leaq (%1,%%r12,2),%1; addq %%r12,%1;"\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
     ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
-    "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r13","r14","r15");\
+    "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 
-static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
+static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
 //perform C += A B
     if(k==0 || m==0 || ndiv8==0) return;
     int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
@@ -429,38 +427,17 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
     double *c_pointer = c;
     __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
     BLASLONG ndiv8_count;
-    double *b_scratch;
-    posix_memalign(&b_scratch,64,192*k);
     double *packed_b_pointer = packed_b;
     a_block_pointer = packed_a;
     for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){
-      __asm__ __volatile__ (
-        "testq %2,%2; jz 100002f;movq %2,%%r13;shlq $6,%%r13;"
-        "100001:\n\t"
-        "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; vmovupd (%0,%%r13,2),%%zmm7; addq $64,%0;"
-        "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); vmovupd %%zmm7,128(%1); addq $192,%1;"
-        "decq %2; testq %2,%2; jnz 100001b;"
-        "100002:\n\t"
-        "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,2),%0;subq %%r13,%1;subq %%r13,%1;subq %%r13,%1;"
-        :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6","zmm7");
       COMPUTE_n24
     }
     for(;ndiv8_count>1;ndiv8_count-=2){
-      __asm__ __volatile__ (
-        "testq %2,%2; jz 1000002f;movq %2,%%r13;shlq $6,%%r13;"
-        "1000001:\n\t"
-        "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; addq $64,%0;"
-        "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); addq $128,%1;"
-        "decq %2; testq %2,%2; jnz 1000001b;"
-        "1000002:\n\t"
-        "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,1),%0;subq %%r13,%1;subq %%r13,%1;"
-        :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6");
       COMPUTE_n16
     }
     if(ndiv8_count>0){
       COMPUTE_n8
     }
-    free(b_scratch);b_scratch=NULL;
 }
 
 /* __m512d accumulators: zc1-zc4; temporary variables: za1,zb1-zb2 */
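Summary of the change: the n16/n24 compute macros previously required the caller to copy two or three 8-wide panels of packed B into an interleaved scratch buffer (b_scratch, allocated with posix_memalign and freed on every call). They now read the packed B panels in place: r12 holds the panel stride K*64 bytes, so (%1,%%r12,1) and (%1,%%r12,2) address the second and third panels, a single "subq %%r12,%1" rewinds the B pointer after each row block, and "leaq (%1,%%r12,2),%1" (plus "addq %%r12,%1" for n24) advances it past the consumed panels. The scalar sketch below is only an illustration of that addressing for the m1n24 case under my assumptions (the model_m1n24 name and acc array are invented; the layout of 8 doubles per k-step per panel is inferred from the "addq $64,%1" step); it is not code from the patch.

/* Scalar model of how the patched n24 micro-kernel walks packed B directly
 * instead of copying it into b_scratch first. ldc is in elements here,
 * whereas the asm kernel uses ldc_in_bytes. */
#include <stddef.h>

typedef long long BLASLONG;   /* assumption: 64-bit BLASLONG, as on x86_64 */

static void model_m1n24(const double *a, const double *b, BLASLONG k,
                        double *c, BLASLONG ldc)
{
    /* doubles between consecutive 8-wide B panels; corresponds to r12/8 (r12 = K<<6 bytes) */
    const size_t panel = (size_t)k * 8;
    double acc[24] = {0.0};
    for (BLASLONG i = 0; i < k; i++) {
        double a0 = a[i];                                    /* vbroadcastsd (%0),%%zmm4      */
        for (int j = 0; j < 8; j++) {
            acc[j]      += b[i*8 + j]           * a0;        /* vmovupd (%1),%%zmm5           */
            acc[j + 8]  += b[i*8 + j + panel]   * a0;        /* vmovupd (%1,%%r12,1),%%zmm6   */
            acc[j + 16] += b[i*8 + j + 2*panel] * a0;        /* vmovupd (%1,%%r12,2),%%zmm7   */
        }
    }
    for (int j = 0; j < 24; j++) c[(size_t)j * ldc] += acc[j];   /* column-major C update */
}

With this in-place layout the per-panel copy loops and the heap allocation disappear, and the end-of-block pointer fix-ups shrink from shift/subtract/shift sequences to a single subtraction of r12.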