Add build-time option for OMP scheduler; document MULTITHREAD_THRESHOLD range (#1620)
* Allow choosing the OpenMP scheduler and add range hint for GEMM_MULTITHREAD_THRESHOLD * Amended description of GEMM_MULTITHREAD_THRESHOLD to reflect #742 making it track floating point operations rather than matrix size
This commit is contained in:
parent
12603b7dbb
commit
47bf0dba8f
|
@ -60,6 +60,14 @@ VERSION = 0.3.1.dev
|
||||||
# This flag is always set for POWER8. Don't modify the flag
|
# This flag is always set for POWER8. Don't modify the flag
|
||||||
# USE_OPENMP = 1
|
# USE_OPENMP = 1
|
||||||
|
|
||||||
|
# The OpenMP scheduler to use - by default this is "static" and you
|
||||||
|
# will normally not want to change this unless you know that your main
|
||||||
|
# workload will involve tasks that have highly unbalanced running times
|
||||||
|
# for individual threads. Changing away from "static" may also adversely
|
||||||
|
# affect memory access locality in NUMA systems. Setting to "runtime" will
|
||||||
|
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
|
||||||
|
# CCOMMON_OPT += -DOMP_SCHED=dynamic
|
||||||
|
|
||||||
# You can define maximum number of threads. Basically it should be
|
# You can define maximum number of threads. Basically it should be
|
||||||
# less than actual number of cores. If you don't specify one, it's
|
# less than actual number of cores. If you don't specify one, it's
|
||||||
# automatically detected by the script.
|
# automatically detected by the script.
|
||||||
|
@ -156,8 +164,11 @@ NO_AFFINITY = 1
|
||||||
# CONSISTENT_FPCSR = 1
|
# CONSISTENT_FPCSR = 1
|
||||||
|
|
||||||
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
|
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
|
||||||
# with single thread. You can use this flag to avoid the overhead of multi-threading
|
# with single thread. (Actually in recent versions this is a factor proportional to the
|
||||||
# in small matrix sizes. The default value is 4.
|
# number of floating point operations necessary for the given problem size, no longer
|
||||||
|
# an individual dimension). You can use this setting to avoid the overhead of multi-
|
||||||
|
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
|
||||||
|
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
|
||||||
# GEMM_MULTITHREAD_THRESHOLD = 4
|
# GEMM_MULTITHREAD_THRESHOLD = 4
|
||||||
|
|
||||||
# If you need a sanity check by comparing with reference BLAS. It'll be very
|
# If you need a sanity check by comparing with reference BLAS. It'll be very
|
||||||
|
|
|
@ -48,6 +48,10 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
#ifndef OMP_SCHED
|
||||||
|
#define OMP_SCHED static
|
||||||
|
#endif
|
||||||
|
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
|
|
||||||
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
||||||
|
@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(OMP_SCHED)
|
||||||
for (i = 0; i < num; i ++) {
|
for (i = 0; i < num; i ++) {
|
||||||
|
|
||||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||||
|
|
Loading…
Reference in New Issue