diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index cfbff7554..bf558447e 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -365,12 +365,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Split local region of B into parts */ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ min_jj = MIN(n_to, js + div_n) - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif /* Copy part of local region of B into workspace */ START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,