diff --git a/Makefile.install b/Makefile.install index fa657beba..069c96c6a 100644 --- a/Makefile.install +++ b/Makefile.install @@ -48,6 +48,7 @@ ifndef NO_CBLAS @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif +ifneq ($(OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @@ -72,6 +73,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif + ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @@ -93,6 +95,33 @@ ifeq ($(OSNAME), CYGWIN_NT) endif endif +else +#install on AIX has different options syntax +ifndef NO_LAPACKE + @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" +endif + +#for install static library +ifndef NO_STATIC + @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ + ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) +endif +#for install shared library +ifndef NO_SHARED + @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ + ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ + ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) +endif + +endif #Generating openblas.pc @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" diff --git a/Makefile.rule b/Makefile.rule index 6522b0777..d97607f2e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -192,8 +192,8 @@ NO_AFFINITY = 1 # Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT # COMMON_OPT = -O2 -# gfortran option for LAPACK -# enable this flag only on 64bit Linux and if you need a thread safe lapack library +# gfortran option for LAPACK to improve thread-safety +# It is enabled by default in Makefile.system for gfortran # Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT # FCOMMON_OPT = -frecursive diff --git a/Makefile.system b/Makefile.system index 7847c7525..1427d09fb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -725,6 +725,8 @@ endif ifeq ($(F_COMPILER), GFORTRAN) CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall +# make single-threaded LAPACK calls thread-safe #1847 +FCOMMON_OPT += -frecursive #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran @@ -1211,7 +1213,11 @@ endif LIBDLLNAME = $(LIBPREFIX).dll IMPLIBNAME = lib$(LIBNAMEBASE).dll.a +ifneq ($(OSNAME), AIX) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) +else +LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) +endif LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 38d59f956..adec28a91 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -44,7 +44,7 @@ endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") - set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") diff --git a/common.h b/common.h index 6c3d5b15e..7fcd5e316 100644 --- a/common.h +++ b/common.h @@ -183,7 +183,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 17078fe7f..3acb395b5 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -115,8 +115,8 @@ int detect(void) fclose(infile); if(cpu_part != NULL && cpu_implementer != NULL) { if (strstr(cpu_implementer, "0x41") && - (strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08") || strstr(cpu_part,"0xd03") )) - return CPU_CORTEXA57; //or compatible A53, A72 + (strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08"))) + return CPU_CORTEXA57; //or compatible, ex. A72 else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) return CPU_VULCAN; else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) diff --git a/cpuid_x86.c b/cpuid_x86.c index 512ad877b..8e4a7cb84 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -2009,6 +2009,8 @@ int get_coretype(void){ switch (model) { case 1: // AMD Ryzen + case 8: + // Ryzen 2 if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index aeb5e6ed4..de29247d4 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -48,6 +48,10 @@ #define SWITCH_RATIO 2 #endif +#ifndef GEMM_PREFERED_SIZE +#define GEMM_PREFERED_SIZE 1 +#endif + //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, return 0; } +static int round_up(int remainder, int width, int multiple) +{ + if (multiple > remainder || width <= multiple) + return width; + width = (width + multiple - 1) / multiple; + width = width * multiple; + return width; +} + + static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { @@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_parts = 0; while (m > 0){ width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); + + width = round_up(m, width, GEMM_PREFERED_SIZE); + m -= width; + if (m < 0) width = width + m; range_M[num_parts + 1] = range_M[num_parts] + width; + num_parts ++; } for (i = num_parts; i < MAX_CPU_NUMBER; i++) { @@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG if (width < SWITCH_RATIO) { width = SWITCH_RATIO; } + width = round_up(n, width, GEMM_PREFERED_SIZE); + n -= width; if (n < 0) width = width + n; range_N[num_parts + 1] = range_N[num_parts] + width; + num_parts ++; } for (j = num_parts; j < MAX_CPU_NUMBER; j++) { diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 6a25e2d07..e5db1804f 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) { long i; +#ifdef SMP_SERVER + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif + if (num_threads < 1) num_threads = blas_num_threads; #ifndef NO_AFFINITY diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 02a25ac39..bae344c59 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -478,7 +478,12 @@ int BLASFUNC(blas_thread_shutdown)(void){ void goto_set_num_threads(int num_threads) { - long i; + long i; + +#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif if (num_threads < 1) num_threads = blas_cpu_number; diff --git a/driver/others/memory.c b/driver/others/memory.c index 4a8e6c067..25f198623 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -259,6 +259,16 @@ int get_num_procs(void) { } #endif +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + + + #ifdef OS_WINDOWS int get_num_procs(void) { @@ -1738,6 +1748,22 @@ int get_num_procs(void) { return nums; } #endif + +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif #ifdef OS_WINDOWS diff --git a/interface/zswap.c b/interface/zswap.c index e33bbafba..372b15447 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -42,6 +42,14 @@ #include "functable.h" #endif +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +// Multithreaded swap gives performance benefits in ThunderX2T99 +#else +// Disable multi-threading as it does not show any performance +// benefits. Keep the multi-threading code for the record. +#undef SMP +#endif + #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ @@ -81,7 +89,7 @@ FLOAT *y = (FLOAT*)vy; #ifdef SMP //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. - if (incx == 0 || incy == 0) + if (incx == 0 || incy == 0 || n < 1048576 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT)) nthreads = 1; else nthreads = num_cpu_avail(1); diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 384e9f60b..6a824c9b5 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, return 0; } + if (m == 0 || n == 0) + return 0; c_offset = c; @@ -69,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_pd(c_offset1, z_zero); _mm512_storeu_pd(c_offset1 + 8, z_zero); _mm512_storeu_pd(c_offset1 + 16, z_zero); @@ -77,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) { _mm512_storeu_pd(c_offset1, z_zero); c_offset1 += 8; i -= 8; diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 54f9664e9..4e40acadf 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, return 0; } + if (n == 0 || m == 0) + return; c_offset = c; @@ -71,13 +73,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) { _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8; diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 1e2509bf0..8262c3488 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -34,6 +34,13 @@ #ifndef _LAPACKE_CONFIG_H_ #define _LAPACKE_CONFIG_H_ +// For Android prior to API 21 (no include) +#if defined(__ANDROID__) +#if __ANDROID_API__ < 21 +#define LAPACK_COMPLEX_STRUCTURE +#endif +#endif + #ifdef __cplusplus #if defined(LAPACK_COMPLEX_CPP) #include diff --git a/param.h b/param.h index e4ec1b2b5..d1b211584 100644 --- a/param.h +++ b/param.h @@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 #define SWITCH_RATIO 32 +#define GEMM_PREFERED_SIZE 32 #ifdef ARCH_X86