diff --git a/Makefile.rule b/Makefile.rule index a73a9553c..f7d60b052 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -88,6 +88,11 @@ VERSION = 0.1alpha2.5 # If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 +# If any gemm argument m, n, or k is less than or equal to this threshold, gemm will be executed +# with a single thread. You can use this flag to avoid the overhead of multi-threading +# for small matrix sizes. The default value is 4. +# GEMM_MULTITHREAD_THRESHOLD = 4 + # If you need santy check by comparing reference BLAS. It'll be very # slow (Not implemented yet). # SANITY_CHECK = 1 diff --git a/Makefile.system b/Makefile.system index 8ec93031e..b8b9ba837 100644 --- a/Makefile.system +++ b/Makefile.system @@ -40,6 +40,11 @@ ifdef INTERFACE64 GETARCH_FLAGS += -DUSE64BITINT endif +ifndef GEMM_MULTITHREAD_THRESHOLD +GEMM_MULTITHREAD_THRESHOLD=4 +endif +GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) + # This operation is expensive, so execution should be once. ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 diff --git a/getarch_2nd.c b/getarch_2nd.c index 018f08d31..5339af442 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -34,6 +34,7 @@ int main(int argc, char **argv) { #ifdef USE64BITINT printf("#define USE64BITINT\n"); #endif + printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long)GEMM_MULTITHREAD_THRESHOLD); } return 0; diff --git a/interface/gemm.c b/interface/gemm.c index 7919f822e..28cf5372d 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -397,8 +397,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transb << BLAS_TRANSB_SHIFT); args.common = NULL; - args.nthreads = num_cpu_avail(3); + if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD + || args.k <= GEMM_MULTITHREAD_THRESHOLD){ + args.nthreads = 1; + }else{ + args.nthreads = num_cpu_avail(3); + } if (args.nthreads == 1) { #endif