Improve performance of GEMM for small matrices when SMP is defined.
Calling num_cpu_avail() unconditionally, regardless of whether threading will actually be used, adds noticeable overhead for small matrices. Most other call sites invoke num_cpu_avail() only when threading will be used, so do the same here.
This commit is contained in:
parent
6adc4b7b36
commit
66316b9f4c
|
@ -44,6 +44,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
|
#define SMP_THRESHOLD_MIN 65536.0
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
#define ERROR_NAME "QGEMM "
|
#define ERROR_NAME "QGEMM "
|
||||||
#elif defined(DOUBLE)
|
#elif defined(DOUBLE)
|
||||||
|
@ -52,6 +53,7 @@
|
||||||
#define ERROR_NAME "SGEMM "
|
#define ERROR_NAME "SGEMM "
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
|
#define SMP_THRESHOLD_MIN 8192.0
|
||||||
#ifndef GEMM3M
|
#ifndef GEMM3M
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
#define ERROR_NAME "XGEMM "
|
#define ERROR_NAME "XGEMM "
|
||||||
|
@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB,
|
||||||
FLOAT *sa, *sb;
|
FLOAT *sa, *sb;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
int nthreads_max;
|
|
||||||
int nthreads_avail;
|
|
||||||
double MNK;
|
double MNK;
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
|
@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||||
XFLOAT *sa, *sb;
|
XFLOAT *sa, *sb;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
int nthreads_max;
|
|
||||||
int nthreads_avail;
|
|
||||||
double MNK;
|
double MNK;
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
|
@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||||
mode |= (transa << BLAS_TRANSA_SHIFT);
|
mode |= (transa << BLAS_TRANSA_SHIFT);
|
||||||
mode |= (transb << BLAS_TRANSB_SHIFT);
|
mode |= (transb << BLAS_TRANSB_SHIFT);
|
||||||
|
|
||||||
nthreads_max = num_cpu_avail(3);
|
|
||||||
nthreads_avail = nthreads_max;
|
|
||||||
|
|
||||||
#ifndef COMPLEX
|
|
||||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||||
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||||
nthreads_max = 1;
|
args.nthreads = 1;
|
||||||
#else
|
|
||||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
|
||||||
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
|
||||||
nthreads_max = 1;
|
|
||||||
#endif
|
|
||||||
args.common = NULL;
|
|
||||||
|
|
||||||
if ( nthreads_max > nthreads_avail )
|
|
||||||
args.nthreads = nthreads_avail;
|
|
||||||
else
|
else
|
||||||
args.nthreads = nthreads_max;
|
args.nthreads = num_cpu_avail(3);
|
||||||
|
args.common = NULL;
|
||||||
|
|
||||||
if (args.nthreads == 1) {
|
if (args.nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
mode |= (trans << BLAS_TRANSA_SHIFT);
|
mode |= (trans << BLAS_TRANSA_SHIFT);
|
||||||
mode |= (side << BLAS_RSIDE_SHIFT);
|
mode |= (side << BLAS_RSIDE_SHIFT);
|
||||||
|
|
||||||
args.nthreads = num_cpu_avail(3);
|
|
||||||
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
|
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||||
args.nthreads = 1;
|
args.nthreads = 1;
|
||||||
else
|
else
|
||||||
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
|
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||||
args.nthreads = 1;
|
args.nthreads = 1;
|
||||||
|
else
|
||||||
|
args.nthreads = num_cpu_avail(3);
|
||||||
|
|
||||||
|
|
||||||
if (args.nthreads == 1) {
|
if (args.nthreads == 1) {
|
||||||
|
|
Loading…
Reference in New Issue