diff --git a/Makefile.rule b/Makefile.rule index 62bf63df4..12734464b 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -60,6 +60,13 @@ VERSION = 0.3.0.dev # automatically detected by the the script. # NUM_THREADS = 24 +# If you have enabled USE_OPENMP and your application would call +# OpenBLAS's calculation API from multi threads, please comment it in. +# This flag defines how many instances of OpenBLAS's calculation API can +# actually run in parallel. If more threads call OpenBLAS's calculation API, +# they need to wait for the preceding API calls to finish or risk data corruption. +# NUM_PARALLEL = 2 + # if you don't need to install the static library, please comment it in. # NO_STATIC = 1 diff --git a/Makefile.system b/Makefile.system index 9a3319412..7bfac1fa8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -184,6 +184,10 @@ endif endif +ifndef NUM_PARALLEL +NUM_PARALLEL = 1 +endif + ifndef NUM_THREADS NUM_THREADS = $(NUM_CORES) endif @@ -966,6 +970,8 @@ endif CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) +CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL) + ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif diff --git a/cmake/system.cmake b/cmake/system.cmake index 3fdd9390c..645895671 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -96,6 +96,10 @@ if (NOT CMAKE_CROSSCOMPILING) endif() +if (NOT DEFINED NUM_PARALLEL) + set(NUM_PARALLEL 1) +endif() + if (NOT DEFINED NUM_THREADS) if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0) # HT? @@ -224,6 +228,8 @@ endif () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}") + if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () diff --git a/common.h b/common.h index 5a599a5af..86c33b2fd 100644 --- a/common.h +++ b/common.h @@ -179,7 +179,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) +#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 8d62a8125..868db3b1d 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -36,6 +36,13 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +#if _STDC_VERSION__ >= 201112L +#ifndef _Atomic +#define _Atomic volatile +#endif +#include +#endif +#include #include #include //#include @@ -49,11 +56,16 @@ int blas_server_avail = 0; -static void * blas_thread_buffer[MAX_CPU_NUMBER]; +static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; +#if _STDC_VERSION__ >= 201112L +static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; +#else +static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; +#endif void goto_set_num_threads(int num_threads) { - int i=0; + int i=0, j=0; if (num_threads < 1) num_threads = blas_num_threads; @@ -68,15 +80,17 @@ void goto_set_num_threads(int num_threads) { omp_set_num_threads(blas_cpu_number); //adjust buffer for each thread - for(i=0; i mode & BLAS_PTHREAD) == 0)) { pos = omp_get_thread_num(); - buffer = blas_thread_buffer[pos]; + buffer = blas_thread_buffer[buf_index][pos]; //fallback if(buffer==NULL) { @@ -291,7 +309,7 @@ static void exec_threads(blas_queue_t *queue){ int exec_blas(BLASLONG num, blas_queue_t *queue){ - BLASLONG i; + BLASLONG i, buf_index; if ((num <= 0) || (queue == NULL)) return 0; @@ -302,6 +320,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ } #endif + while(true) { + for(i=0; i < MAX_PARALLEL_NUMBER; i++) { +#if _STDC_VERSION__ >= 201112L + _Bool inuse = false; + if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { +#else + if(blas_buffer_inuse[i] == false) { + blas_buffer_inuse[i] = true; +#endif + buf_index = i; + break; + } + } + if(i != MAX_PARALLEL_NUMBER) + break; + } + #pragma omp parallel for schedule(static) for (i = 0; i < num; i ++) { @@ -309,9 +344,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ queue[i].position = i; #endif - exec_threads(&queue[i]); + exec_threads(&queue[i], buf_index); } +#if _STDC_VERSION__ >= 201112L + atomic_store(&blas_buffer_inuse[buf_index], false); +#else + blas_buffer_inuse[buf_index] = false; +#endif + return 0; }