Improve performance of GEMM for small matrices when SMP is defined.

Calling num_cpu_avail() unconditionally, regardless of whether threading will
actually be used, adds noticeable overhead for small matrices.  Most other
callers invoke num_cpu_avail() only when threading will be used, so do the
same here.
This commit is contained in:
Craig Donner 2018-06-07 14:54:42 +01:00
parent 6adc4b7b36
commit 66316b9f4c
2 changed files with 8 additions and 22 deletions

View File

@ -44,6 +44,7 @@
#endif #endif
#ifndef COMPLEX #ifndef COMPLEX
#define SMP_THRESHOLD_MIN 65536.0
#ifdef XDOUBLE #ifdef XDOUBLE
#define ERROR_NAME "QGEMM " #define ERROR_NAME "QGEMM "
#elif defined(DOUBLE) #elif defined(DOUBLE)
@ -52,6 +53,7 @@
#define ERROR_NAME "SGEMM " #define ERROR_NAME "SGEMM "
#endif #endif
#else #else
#define SMP_THRESHOLD_MIN 8192.0
#ifndef GEMM3M #ifndef GEMM3M
#ifdef XDOUBLE #ifdef XDOUBLE
#define ERROR_NAME "XGEMM " #define ERROR_NAME "XGEMM "
@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB,
FLOAT *sa, *sb; FLOAT *sa, *sb;
#ifdef SMP #ifdef SMP
int nthreads_max;
int nthreads_avail;
double MNK; double MNK;
#ifndef COMPLEX #ifndef COMPLEX
#ifdef XDOUBLE #ifdef XDOUBLE
@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
XFLOAT *sa, *sb; XFLOAT *sa, *sb;
#ifdef SMP #ifdef SMP
int nthreads_max;
int nthreads_avail;
double MNK; double MNK;
#ifndef COMPLEX #ifndef COMPLEX
#ifdef XDOUBLE #ifdef XDOUBLE
@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT);
nthreads_max = num_cpu_avail(3);
nthreads_avail = nthreads_max;
#ifndef COMPLEX
MNK = (double) args.m * (double) args.n * (double) args.k; MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1; args.nthreads = 1;
#else
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
#endif
args.common = NULL;
if ( nthreads_max > nthreads_avail )
args.nthreads = nthreads_avail;
else else
args.nthreads = nthreads_max; args.nthreads = num_cpu_avail(3);
args.common = NULL;
if (args.nthreads == 1) { if (args.nthreads == 1) {
#endif #endif

View File

@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order,
mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (trans << BLAS_TRANSA_SHIFT);
mode |= (side << BLAS_RSIDE_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT);
args.nthreads = num_cpu_avail(3);
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1; args.nthreads = 1;
else else
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1; args.nthreads = 1;
else
args.nthreads = num_cpu_avail(3);
if (args.nthreads == 1) { if (args.nthreads == 1) {