In OpenMP threading, preallocate a buffer for each thread instead of allocating a new buffer on every call. This patch improves performance slightly.
This commit is contained in:
parent
3cc6ae793e
commit
d744c9590a
|
@ -49,8 +49,12 @@
|
||||||
|
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
|
|
||||||
|
static void * blas_thread_buffer[MAX_CPU_NUMBER];
|
||||||
|
|
||||||
void goto_set_num_threads(int num_threads) {
|
void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
|
int i=0;
|
||||||
|
|
||||||
if (num_threads < 1) num_threads = blas_num_threads;
|
if (num_threads < 1) num_threads = blas_num_threads;
|
||||||
|
|
||||||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
||||||
|
@ -63,6 +67,18 @@ void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
omp_set_num_threads(blas_cpu_number);
|
omp_set_num_threads(blas_cpu_number);
|
||||||
|
|
||||||
|
//adjust buffer for each thread
|
||||||
|
for(i=0; i<blas_cpu_number; i++){
|
||||||
|
if(blas_thread_buffer[i]==NULL){
|
||||||
|
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(; i<MAX_CPU_NUMBER; i++){
|
||||||
|
if(blas_thread_buffer[i]!=NULL){
|
||||||
|
blas_memory_free(blas_thread_buffer[i]);
|
||||||
|
blas_thread_buffer[i]=NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
#if defined(ARCH_MIPS64)
|
#if defined(ARCH_MIPS64)
|
||||||
//set parameters for different number of threads.
|
//set parameters for different number of threads.
|
||||||
blas_set_parameter();
|
blas_set_parameter();
|
||||||
|
@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
int blas_thread_init(void){
|
int blas_thread_init(void){
|
||||||
|
|
||||||
|
int i=0;
|
||||||
|
|
||||||
blas_get_cpu_number();
|
blas_get_cpu_number();
|
||||||
|
|
||||||
blas_server_avail = 1;
|
blas_server_avail = 1;
|
||||||
|
|
||||||
|
for(i=0; i<blas_num_threads; i++){
|
||||||
|
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||||
|
}
|
||||||
|
for(; i<MAX_CPU_NUMBER; i++){
|
||||||
|
blas_thread_buffer[i]=NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int BLASFUNC(blas_thread_shutdown)(void){
|
int BLASFUNC(blas_thread_shutdown)(void){
|
||||||
|
int i=0;
|
||||||
blas_server_avail = 0;
|
blas_server_avail = 0;
|
||||||
|
|
||||||
|
for(i=0; i<MAX_CPU_NUMBER; i++){
|
||||||
|
if(blas_thread_buffer[i]!=NULL){
|
||||||
|
blas_memory_free(blas_thread_buffer[i]);
|
||||||
|
blas_thread_buffer[i]=NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -177,6 +209,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||||
static void exec_threads(blas_queue_t *queue){
|
static void exec_threads(blas_queue_t *queue){
|
||||||
|
|
||||||
void *buffer, *sa, *sb;
|
void *buffer, *sa, *sb;
|
||||||
|
int pos=0, release_flag=0;
|
||||||
|
|
||||||
buffer = NULL;
|
buffer = NULL;
|
||||||
sa = queue -> sa;
|
sa = queue -> sa;
|
||||||
|
@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
|
|
||||||
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
||||||
|
|
||||||
|
pos = omp_get_thread_num();
|
||||||
|
buffer = blas_thread_buffer[pos];
|
||||||
|
|
||||||
|
//fallback
|
||||||
|
if(buffer==NULL) {
|
||||||
buffer = blas_memory_alloc(2);
|
buffer = blas_memory_alloc(2);
|
||||||
|
release_flag=1;
|
||||||
|
}
|
||||||
|
|
||||||
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||||
|
|
||||||
|
@ -242,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (buffer != NULL) blas_memory_free(buffer);
|
if (release_flag) blas_memory_free(buffer);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue