Add performance patch for trmm, mirroring #2836
This commit is contained in:
parent
91c84e1c01
commit
dfeca46098
|
@ -139,7 +139,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
#else
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
#endif
|
#endif
|
||||||
|
@ -209,7 +209,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
#else
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
#endif
|
#endif
|
||||||
|
@ -304,7 +304,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
#else
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
#endif
|
#endif
|
||||||
|
@ -374,7 +374,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
#else
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -126,7 +126,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
#else
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
#endif
|
#endif
|
||||||
|
@ -150,7 +150,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
#else
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
#endif
|
#endif
|
||||||
|
@ -207,7 +207,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
#else
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
#endif
|
#endif
|
||||||
|
@ -262,7 +262,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
#else
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
#endif
|
#endif
|
||||||
|
@ -287,7 +287,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
#else
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
#endif
|
#endif
|
||||||
|
@ -348,7 +348,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
#else
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue