From 5daf8c9d625a2ebdc806e807fe78ee1347152bf5 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 12 Sep 2019 14:01:56 -0400 Subject: [PATCH] openblas_threads_callback mechanism for changing threads backend --- cblas.h | 51 +++++++++++++++------------- common_interface.h | 14 +++++--- driver/others/CMakeLists.txt | 1 + driver/others/Makefile | 2 +- driver/others/blas_server_callback.c | 13 +++++++ driver/others/blas_server_omp.c | 24 +++++++++---- 6 files changed, 70 insertions(+), 35 deletions(-) create mode 100644 driver/others/blas_server_callback.c diff --git a/cblas.h b/cblas.h index 1a87074d6..768e0c92d 100644 --- a/cblas.h +++ b/cblas.h @@ -25,6 +25,11 @@ char* openblas_get_config(void); /*Get the CPU corename on runtime.*/ char* openblas_get_corename(void); +/*Set the threading backend to a custom callback.*/ +typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, void *dojob_data); +typedef void (*openblas_threads_callback)(void *callback_data, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, void *dojob_data); +void openblas_set_threads_callback(openblas_threads_callback callback, void *callback_data); + /* Get the parallelization type which is used by OpenBLAS */ int openblas_get_parallel(void); /* OpenBLAS is compiled for sequential use */ @@ -52,7 +57,7 @@ typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; typedef CBLAS_ORDER CBLAS_LAYOUT; - + float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); @@ -350,32 +355,32 @@ void cblas_caxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS void cblas_zaxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy); -void cblas_somatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, - OPENBLAS_CONST blasint clda, float *b, OPENBLAS_CONST blasint cldb); +void cblas_somatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, + OPENBLAS_CONST blasint clda, float *b, OPENBLAS_CONST blasint cldb); void cblas_domatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, - OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); -void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, OPENBLAS_CONST float* a, - OPENBLAS_CONST blasint clda, float*b, OPENBLAS_CONST blasint cldb); -void cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a, - OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); + OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); +void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, OPENBLAS_CONST float* a, + OPENBLAS_CONST blasint clda, float*b, OPENBLAS_CONST blasint cldb); +void cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a, + OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); -void cblas_simatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, - OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); +void cblas_simatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, + OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); void cblas_dimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, - OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); -void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, float* a, - OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); -void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, - OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); + OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); +void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, float* a, + OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); +void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, + OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); -void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, - float *c, OPENBLAS_CONST blasint cldc); -void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, - double *c, OPENBLAS_CONST blasint cldc); -void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, - float *c, OPENBLAS_CONST blasint cldc); -void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, - double *c, OPENBLAS_CONST blasint cldc); +void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, + float *c, OPENBLAS_CONST blasint cldc); +void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, + double *c, OPENBLAS_CONST blasint cldc); +void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, + float *c, OPENBLAS_CONST blasint cldc); +void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, + double *c, OPENBLAS_CONST blasint cldc); #ifdef __cplusplus diff --git a/common_interface.h b/common_interface.h index c350ac8ec..7a58cafd2 100644 --- a/common_interface.h +++ b/common_interface.h @@ -47,6 +47,12 @@ int BLASFUNC(xerbla)(char *, blasint *info, blasint); void openblas_set_num_threads_(int *); +typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, void *dojob_data); +typedef void (*openblas_threads_callback)(void *callback_data, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, void *dojob_data); +void openblas_set_threads_callback(openblas_threads_callback callback, void *callback_data); +extern openblas_threads_callback openblas_threads_callback_; +extern void *openblas_threads_callback_data_; + FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); @@ -761,10 +767,10 @@ void BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double *, do void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *); void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *); -void BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); -void BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); -void BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); -void BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); +void BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); +void BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); +void BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); +void BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); #ifdef __cplusplus diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index a07e00b3b..e8ac88437 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -39,6 +39,7 @@ set(COMMON_SOURCES openblas_env.c openblas_get_num_procs.c openblas_get_num_threads.c + blas_server_callback.c ) # these need to have NAME/CNAME set, so use GenerateNamedObjects, but don't use standard name mangling diff --git a/driver/others/Makefile b/driver/others/Makefile index d4b5c26d5..e59c2681e 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,7 +1,7 @@ TOPDIR = ../.. include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX) blas_server_callback.$(SUFFIX) #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) diff --git a/driver/others/blas_server_callback.c b/driver/others/blas_server_callback.c new file mode 100644 index 000000000..cf6e7d7fd --- /dev/null +++ b/driver/others/blas_server_callback.c @@ -0,0 +1,13 @@ +#include "common.h" + +/* global variable to change threading backend from openblas-managed to caller-managed */ +openblas_threads_callback openblas_threads_callback_ = 0; +void *openblas_threads_callback_data_ = 0; + +/* non-threadsafe function should be called before any other + openblas function to change how threads are managed */ +void openblas_set_threads_callback(openblas_threads_callback callback, void *callback_data) +{ + openblas_threads_callback_ = callback; + openblas_threads_callback_data_ = callback_data; +} \ No newline at end of file diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 4255852c8..a26e43be2 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -75,7 +75,8 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; - omp_set_num_threads(blas_cpu_number); + if (!openblas_threads_callback_) + omp_set_num_threads(blas_cpu_number); //adjust buffer for each thread for(i=0; i sa; @@ -238,8 +238,7 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { - pos = omp_get_thread_num(); - buffer = blas_thread_buffer[buf_index][pos]; + buffer = blas_thread_buffer[*buf_index][thread_num]; //fallback if(buffer==NULL) { @@ -335,14 +334,25 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } + if (openblas_threads_callback_) { +#ifndef USE_SIMPLE_THREADED_LEVEL3 + for (i = 0; i < num; i ++) + queue[i].position = i; +#endif + openblas_threads_callback_(openblas_threads_callback_data_, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, (void*) &buf_index); + return; + } + #pragma omp parallel for schedule(OMP_SCHED) + +#pragma omp parallel for schedule(static) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 queue[i].position = i; #endif - exec_threads(&queue[i], buf_index); + exec_threads(omp_get_thread_num(), &queue[i], &buf_index); } #if __STDC_VERSION__ >= 201112L