Improve performance of GEMM for small matrices when SMP is defined.

Calling num_cpu_avail() unconditionally, regardless of whether threading will
actually be used, adds noticeable overhead for small matrices.  Most other
callers invoke num_cpu_avail() only when threading will be used, so do the
same here.
This commit is contained in:
Craig Donner 2018-06-07 14:54:42 +01:00
parent 6adc4b7b36
commit 4b0de7690d
1 changed file with 5 additions and 3 deletions

View File

@@ -411,20 +411,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT);
nthreads_max = num_cpu_avail(3);
nthreads_avail = nthreads_max;
#ifndef COMPLEX
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
else
nthreads_max = num_cpu_avail(3);
#else
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
else
nthreads_max = num_cpu_avail(3);
#endif
args.common = NULL;
nthreads_avail = nthreads_max;
if ( nthreads_max > nthreads_avail )
args.nthreads = nthreads_avail;
else