Merge pull request #1536 from WestAlgo/develop
Fix race condition in blas_server_omp.c
This commit is contained in:
commit
50acc40613
|
@ -60,6 +60,13 @@ VERSION = 0.3.0.dev
|
|||
# automatically detected by the the script.
|
||||
# NUM_THREADS = 24
|
||||
|
||||
# If you have enabled USE_OPENMP and your application would call
|
||||
# OpenBLAS's calculation API from multi threads, please comment it in.
|
||||
# This flag defines how many instances of OpenBLAS's calculation API can
|
||||
# actually run in parallel. If more threads call OpenBLAS's calculation API,
|
||||
# they need to wait for the preceding API calls to finish or risk data corruption.
|
||||
# NUM_PARALLEL = 2
|
||||
|
||||
# if you don't need to install the static library, please comment it in.
|
||||
# NO_STATIC = 1
|
||||
|
||||
|
|
|
@ -184,6 +184,10 @@ endif
|
|||
|
||||
endif
|
||||
|
||||
ifndef NUM_PARALLEL
|
||||
NUM_PARALLEL = 1
|
||||
endif
|
||||
|
||||
ifndef NUM_THREADS
|
||||
NUM_THREADS = $(NUM_CORES)
|
||||
endif
|
||||
|
@ -966,6 +970,8 @@ endif
|
|||
|
||||
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
|
||||
|
||||
CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
|
||||
|
||||
ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
|
|
|
@ -96,6 +96,10 @@ if (NOT CMAKE_CROSSCOMPILING)
|
|||
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NUM_PARALLEL)
|
||||
set(NUM_PARALLEL 1)
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NUM_THREADS)
|
||||
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
|
||||
# HT?
|
||||
|
@ -224,6 +228,8 @@ endif ()
|
|||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
|
||||
|
||||
if (USE_SIMPLE_THREADED_LEVEL3)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
||||
endif ()
|
||||
|
|
2
common.h
2
common.h
|
@ -179,7 +179,7 @@ extern "C" {
|
|||
|
||||
#define ALLOCA_ALIGN 63UL
|
||||
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
|
||||
|
||||
#ifdef NEEDBUNDERSCORE
|
||||
#define BLASFUNC(FUNC) FUNC##_
|
||||
|
|
|
@ -36,6 +36,13 @@
|
|||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#if _STDC_VERSION__ >= 201112L
|
||||
#ifndef _Atomic
|
||||
#define _Atomic volatile
|
||||
#endif
|
||||
#include <stdatomic.h>
|
||||
#endif
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
//#include <sys/mman.h>
|
||||
|
@ -49,11 +56,16 @@
|
|||
|
||||
int blas_server_avail = 0;
|
||||
|
||||
static void * blas_thread_buffer[MAX_CPU_NUMBER];
|
||||
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
||||
#if _STDC_VERSION__ >= 201112L
|
||||
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||
#else
|
||||
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||
#endif
|
||||
|
||||
void goto_set_num_threads(int num_threads) {
|
||||
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
|
||||
if (num_threads < 1) num_threads = blas_num_threads;
|
||||
|
||||
|
@ -68,15 +80,17 @@ void goto_set_num_threads(int num_threads) {
|
|||
omp_set_num_threads(blas_cpu_number);
|
||||
|
||||
//adjust buffer for each thread
|
||||
for(i=0; i<blas_cpu_number; i++){
|
||||
if(blas_thread_buffer[i]==NULL){
|
||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<blas_cpu_number; j++){
|
||||
if(blas_thread_buffer[i][j]==NULL){
|
||||
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||
}
|
||||
}
|
||||
for(; i<MAX_CPU_NUMBER; i++){
|
||||
if(blas_thread_buffer[i]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i]);
|
||||
blas_thread_buffer[i]=NULL;
|
||||
for(; j<MAX_CPU_NUMBER; j++){
|
||||
if(blas_thread_buffer[i][j]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i][j]);
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(ARCH_MIPS64)
|
||||
|
@ -92,30 +106,34 @@ void openblas_set_num_threads(int num_threads) {
|
|||
|
||||
int blas_thread_init(void){
|
||||
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
|
||||
blas_get_cpu_number();
|
||||
|
||||
blas_server_avail = 1;
|
||||
|
||||
for(i=0; i<blas_num_threads; i++){
|
||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<blas_num_threads; j++){
|
||||
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||
}
|
||||
for(; j<MAX_CPU_NUMBER; j++){
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
for(; i<MAX_CPU_NUMBER; i++){
|
||||
blas_thread_buffer[i]=NULL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int BLASFUNC(blas_thread_shutdown)(void){
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
blas_server_avail = 0;
|
||||
|
||||
for(i=0; i<MAX_CPU_NUMBER; i++){
|
||||
if(blas_thread_buffer[i]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i]);
|
||||
blas_thread_buffer[i]=NULL;
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<MAX_CPU_NUMBER; j++){
|
||||
if(blas_thread_buffer[i][j]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i][j]);
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -206,7 +224,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
}
|
||||
}
|
||||
|
||||
static void exec_threads(blas_queue_t *queue){
|
||||
static void exec_threads(blas_queue_t *queue, int buf_index){
|
||||
|
||||
void *buffer, *sa, *sb;
|
||||
int pos=0, release_flag=0;
|
||||
|
@ -223,7 +241,7 @@ static void exec_threads(blas_queue_t *queue){
|
|||
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
||||
|
||||
pos = omp_get_thread_num();
|
||||
buffer = blas_thread_buffer[pos];
|
||||
buffer = blas_thread_buffer[buf_index][pos];
|
||||
|
||||
//fallback
|
||||
if(buffer==NULL) {
|
||||
|
@ -291,7 +309,7 @@ static void exec_threads(blas_queue_t *queue){
|
|||
|
||||
int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||
|
||||
BLASLONG i;
|
||||
BLASLONG i, buf_index;
|
||||
|
||||
if ((num <= 0) || (queue == NULL)) return 0;
|
||||
|
||||
|
@ -302,6 +320,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
}
|
||||
#endif
|
||||
|
||||
while(true) {
|
||||
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
|
||||
#if _STDC_VERSION__ >= 201112L
|
||||
_Bool inuse = false;
|
||||
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
|
||||
#else
|
||||
if(blas_buffer_inuse[i] == false) {
|
||||
blas_buffer_inuse[i] = true;
|
||||
#endif
|
||||
buf_index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(i != MAX_PARALLEL_NUMBER)
|
||||
break;
|
||||
}
|
||||
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (i = 0; i < num; i ++) {
|
||||
|
||||
|
@ -309,9 +344,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
queue[i].position = i;
|
||||
#endif
|
||||
|
||||
exec_threads(&queue[i]);
|
||||
exec_threads(&queue[i], buf_index);
|
||||
}
|
||||
|
||||
#if _STDC_VERSION__ >= 201112L
|
||||
atomic_store(&blas_buffer_inuse[buf_index], false);
|
||||
#else
|
||||
blas_buffer_inuse[buf_index] = false;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue