Improve performance of GEMM for small matrices when SMP is defined.
Calling num_cpu_avail() unconditionally, regardless of whether threading will actually be used, adds noticeable overhead for small matrices. Most other callers invoke num_cpu_avail() only when threading will be used, so do the same here.
This commit is contained in:
parent
6adc4b7b36
commit
4b0de7690d
|
@ -411,20 +411,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||||
mode |= (transa << BLAS_TRANSA_SHIFT);
|
mode |= (transa << BLAS_TRANSA_SHIFT);
|
||||||
mode |= (transb << BLAS_TRANSB_SHIFT);
|
mode |= (transb << BLAS_TRANSB_SHIFT);
|
||||||
|
|
||||||
nthreads_max = num_cpu_avail(3);
|
|
||||||
nthreads_avail = nthreads_max;
|
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||||
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||||
nthreads_max = 1;
|
nthreads_max = 1;
|
||||||
|
else
|
||||||
|
nthreads_max = num_cpu_avail(3);
|
||||||
#else
|
#else
|
||||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||||
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||||
nthreads_max = 1;
|
nthreads_max = 1;
|
||||||
|
else
|
||||||
|
nthreads_max = num_cpu_avail(3);
|
||||||
#endif
|
#endif
|
||||||
args.common = NULL;
|
args.common = NULL;
|
||||||
|
|
||||||
|
nthreads_avail = nthreads_max;
|
||||||
if ( nthreads_max > nthreads_avail )
|
if ( nthreads_max > nthreads_avail )
|
||||||
args.nthreads = nthreads_avail;
|
args.nthreads = nthreads_avail;
|
||||||
else
|
else
|
||||||
|
|
Loading…
Reference in New Issue