From 41e802443a3ceb72f7c031a8b77fa45633c7885b Mon Sep 17 00:00:00 2001 From: Baptiste Daroussin Date: Fri, 3 Apr 2020 06:20:42 +0200 Subject: [PATCH 1/5] libname: treat FreeBSD and DragonFly like linux and sunos There is no difference in the way libnames are handle between FreeBSD and linux or sunos. FreeBSD and DragonFly prefers having sonames as well --- Makefile | 4 ++-- Makefile.install | 4 ++-- exports/Makefile | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 0b920cc9f..18320e6a3 100644 --- a/Makefile +++ b/Makefile @@ -112,12 +112,12 @@ endif shared : ifneq ($(NO_SHARED), 1) -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) +ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif diff --git a/Makefile.install b/Makefile.install index 2dc32c3d9..dad869f4c 100644 --- a/Makefile.install +++ b/Makefile.install @@ -68,14 +68,14 @@ endif #for install shared library ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) +ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so diff --git a/exports/Makefile b/exports/Makefile index d32e449df..60291b1ff 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -126,7 +126,7 @@ endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) so : ../$(LIBSONAME) @@ -171,7 +171,7 @@ endif endif #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) +ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) so : ../$(LIBSONAME) From 69f277f8eea068a7283543620d9f4c2402dbaa8e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 Apr 2020 11:04:51 +0200 Subject: [PATCH 2/5] Add another memory barrier for ARM and a multicore test run on ThunderX to help detect such issues (#2544) * Add another memory barrier in memory.c to prevent races in memory slot allocation * Add an all-core test on Drone.io's ThunderX platform and modify dgemm_tester to use all 96 cores --- .drone.yml | 25 +++++++++++++++++++++++++ cpp_thread_test/dgemm_thread_safety.cpp | 2 +- driver/others/memory.c | 2 +- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 696c5a99d..3bbd8fc88 100644 --- a/.drone.yml +++ b/.drone.yml @@ -141,3 +141,28 @@ steps: - cmake $CMAKE_FLAGS .. - make -j - ctest -V + +--- +kind: pipeline +name: arm64_native_test + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:19.04 + environment: + CC: gcc + COMMON_FLAGS: 'USE_OPENMP=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C cpp_thread_test dgemm_tester diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp index cecf794fa..1b6ad3826 100644 --- a/cpp_thread_test/dgemm_thread_safety.cpp +++ b/cpp_thread_test/dgemm_thread_safety.cpp @@ -12,7 +12,7 @@ void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMat int main(int argc, char* argv[]){ blasint randomMatSize = 1024; //dimension of the random square matrices used - uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested + uint32_t numConcurrentThreads = 96; //number of concurrent calls of the functions being tested uint32_t numTestRounds = 16; //number of testing rounds before success exit if (argc > 4){ diff --git a/driver/others/memory.c b/driver/others/memory.c index 62a5a0214..1af547fb2 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2740,7 +2740,7 @@ void *blas_memory_alloc(int procpos){ #ifdef DEBUG printf(" Position -> %d\n", position); #endif - +WMB; memory[position].used = 1; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); From 7b4773b24d83ef81de6291da3d350ab98045d4a0 Mon Sep 17 00:00:00 2001 From: Sharvil Nanavati Date: Wed, 8 Apr 2020 12:47:41 -0700 Subject: [PATCH 3/5] Add API to set thread affinity on Linux. Issue: #2545 --- cblas.h | 5 +++++ driver/others/blas_server.c | 18 ++++++++++++++++++ openblas_config_template.h | 5 +++++ 3 files changed, 28 insertions(+) diff --git a/cblas.h b/cblas.h index 1a87074d6..4bc5588d8 100644 --- a/cblas.h +++ b/cblas.h @@ -25,6 +25,11 @@ char* openblas_get_config(void); /*Get the CPU corename on runtime.*/ char* openblas_get_corename(void); +#ifdef OPENBLAS_OS_LINUX +/* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */ +int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set); +#endif + /* Get the parallelization type which is used by OpenBLAS */ int openblas_get_parallel(void); /* OpenBLAS is compiled for sequential use */ diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index aa0644845..f13b83dd4 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -72,6 +72,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU) #include +#include #include #include #include @@ -279,6 +280,23 @@ int get_node(void); static int increased_threads = 0; +#ifdef OS_LINUX +int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { + const int active_threads = openblas_get_num_threads(); + + if (thread_idx < 0 || thread_idx >= active_threads) { + errno = EINVAL; + return -1; + } + + pthread_t thread = (thread_idx == active_threads - 1) + ? pthread_self() + : blas_threads[thread_idx]; + + return pthread_setaffinity_np(thread, cpusetsize, cpu_set); +} +#endif + static void* blas_thread_server(void *arg){ /* Thread identifier */ diff --git a/openblas_config_template.h b/openblas_config_template.h index 52dd49da2..49aea1cab 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -91,3 +91,8 @@ typedef int blasint; #define openblas_complex_xdouble_real(z) ((z).real) #define openblas_complex_xdouble_imag(z) ((z).imag) #endif + +/* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */ +#ifdef OPENBLAS_OS_LINUX +#include +#endif From 8d07cf9b67152486bdbe456e97b3a7b1377fe63b Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 9 Apr 2020 19:25:13 +0800 Subject: [PATCH 4/5] Fix compilation problem on loongson platform Using "make TARGET=GENERIC" on loongson platform will get the following error messages: "make[1]: *** No rule to make target 'sgemm_incopy.o', needed by 'libs'" Add kernel/mips64/KERNEL.generic to slove the problem. --- kernel/mips64/KERNEL.generic | 160 +++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 kernel/mips64/KERNEL.generic diff --git a/kernel/mips64/KERNEL.generic b/kernel/mips64/KERNEL.generic new file mode 100644 index 000000000..17f2ef976 --- /dev/null +++ b/kernel/mips64/KERNEL.generic @@ -0,0 +1,160 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = ../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c + +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = ../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c +DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c + +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c From 66f89c0aaf5f3b179baba7c974afe14291c606c5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 10 Apr 2020 22:06:44 +0200 Subject: [PATCH 5/5] Match thread count to machine capability --- cpp_thread_test/dgemm_thread_safety.cpp | 6 +++++- cpp_thread_test/dgemv_thread_safety.cpp | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp index cecf794fa..1c5287524 100644 --- a/cpp_thread_test/dgemm_thread_safety.cpp +++ b/cpp_thread_test/dgemm_thread_safety.cpp @@ -12,9 +12,13 @@ void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMat int main(int argc, char* argv[]){ blasint randomMatSize = 1024; //dimension of the random square matrices used - uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested + uint32_t numConcurrentThreads = 96; //number of concurrent calls of the functions being tested uint32_t numTestRounds = 16; //number of testing rounds before success exit + uint32_t maxHwThreads = omp_get_max_threads(); + if (maxHwThreads < 96) + numConcurrentThreads = maxHwThreads; + if (argc > 4){ std::cout<<"ERROR: too many arguments for thread safety tester"< 4){ std::cout<<"ERROR: too many arguments for thread safety tester"<