diff --git a/Makefile.rule b/Makefile.rule index 5c03d0195..649aabe70 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -60,6 +60,14 @@ VERSION = 0.3.1.dev # This flag is always set for POWER8. Don't modify the flag # USE_OPENMP = 1 +# The OpenMP scheduler to use - by default this is "static" and you +# will normally not want to change this unless you know that your main +# workload will involve tasks that have highly unbalanced running times +# for individual threads. Changing away from "static" may also adversely +# affect memory access locality in NUMA systems. Setting to "runtime" will +# allow you to select the scheduler from the environment variable OMP_SCHEDULE +# CCOMMON_OPT += -DOMP_SCHED=dynamic + # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. @@ -156,8 +164,11 @@ NO_AFFINITY = 1 # CONSISTENT_FPCSR = 1 # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute -# with single thread. You can use this flag to avoid the overhead of multi-threading -# in small matrix sizes. The default value is 4. +# with single thread. (Actually in recent versions this is a factor proportional to the +# number of floating point operations necessary for the given problem size, no longer +# an individual dimension). You can use this setting to avoid the overhead of multi- +# threading in small matrix sizes. The default value is 4, but values as high as 50 have +# been reported to be optimal for certain workloads (50 is the recommended value for Julia). # GEMM_MULTITHREAD_THRESHOLD = 4 # If you need santy check by comparing reference BLAS. It'll be very diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index fccdb4320..4255852c8 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -48,6 +48,10 @@ #else +#ifndef OMP_SCHED +#define OMP_SCHED static +#endif + int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; @@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(OMP_SCHED) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3