From 8396e9e7774537b95ea1409f90d6e98d5d5a6800 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 27 Jun 2018 00:00:27 +0200 Subject: [PATCH 01/20] Handle NOFORTRAN=0 --- Makefile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 49dab6484..9a7a25bfc 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -119,7 +119,7 @@ endif endif tests : -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -221,7 +221,7 @@ netlib : else netlib : lapack_prebuild -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -242,10 +242,10 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : - $(info filter value of NOFORTRAN is:) - $(info x$(filter-out $(NOFORTRAN), 1 2)x) + $(info filter value of x$(NOFORTRAN)x is:) + $(info x$(filter 0,$(NOFORTRAN))x) -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -288,21 +288,21 @@ endif endif large.tgz : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) if [ ! 
-a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; fi endif timing.tgz : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/timing.tgz; fi endif lapack-timing : large.tgz timing.tgz -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING From cc92257ea6f53fd1e315af08f5981686212a4781 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 27 Jun 2018 00:09:21 +0200 Subject: [PATCH 02/20] Update Makefile --- Makefile | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Makefile b/Makefile index 9b9a1f795..b947c1198 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif +ifeq ($(NO_FORTRAN), 1) +define NOFORTRAN +1 +endef +define NO_LAPACK +1 +endef +export NOFORTRAN +export NO_LAPACK +endif + LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench From 1392eba488b70c8fb7156ef506037adb1979faf3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Jul 2018 12:01:16 +0200 Subject: [PATCH 03/20] set version number to 0.3.2.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a2421ac54..1bc570961 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 1.dev) +set(OpenBLAS_PATCH_VERSION 2.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 
bbf212497062827e4e6d98025f22c2fc47afd918 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Jul 2018 12:01:51 +0200 Subject: [PATCH 04/20] set version number to 0.3.2.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 649aabe70..a3f3b23b9 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.1.dev +VERSION = 0.3.2.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 9d15a3bd16d5548701474d6ecf618b669a4ff394 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Jul 2018 14:40:41 +0200 Subject: [PATCH 05/20] Fix typo that broke compilation with DYNAMIC_ARCH and NO_AVX2 fixes 1659 --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 13794207c..d727f1045 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -510,7 +510,7 @@ static gotoblas_t *get_coretype(void){ #ifndef NO_AVX2 return &gotoblas_HASWELL; #else - return &gotblas_SANDYBRIDGE; + return &gotoblas_SANDYBRIDGE; #endif else return &gotoblas_NEHALEM; From b74aef28165c058cc11c74bc5a7b00ddfce15b31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Jul 2018 14:41:44 +0200 Subject: [PATCH 06/20] Add -march=skylake-avx512 to AVX512 compile check and suppress its output --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index a565fc0d5..d339a755f 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -68,7 +68,7 @@ endif() if (X86_64 OR X86) file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") -execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o 
-x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512) +execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() From a49203b48c4a3d6f86413fc8c4b1fbfaa1946463 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Jul 2018 17:35:54 +0200 Subject: [PATCH 07/20] Double MAX_ALLOCATING_THREADS to fix segfaults with Go and Octave for #1641 --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 98bcfb216..a8b76a85a 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -497,7 +497,7 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); #if defined(SMP) && !defined(USE_OPENMP) /* This is the number of threads than can be spawned by the server, which is the server plus the number of threads in the thread pool */ -# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1 +# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2 static int next_memory_table_pos = 0; # if defined(HAS_COMPILER_TLS) /* Use compiler generated thread-local-storage */ From 3f73e8b8cfcfb9c5fb40b75dd5e4435487db0655 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Jul 2018 21:01:35 +0200 Subject: [PATCH 08/20] Add cpuid for AMD Ryzen 2 for #1664 --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 89eb809b0..512ad877b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1452,6 +1452,8 @@ int get_cpuname(void){ switch (model) { case 1: // AMD Ryzen + case 8: + // AMD Ryzen2 if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_ZEN; From d0ec4325cf2b5bf5b9a11c3f173f7ef2dd10d79e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Jul 2018 21:03:24 +0200 
Subject: [PATCH 09/20] Add cpuid for AMD Ryzen 2 --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 13794207c..f72902411 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){ } } } else if (exfamily == 8) { - if (model == 1) { + if (model == 1 || model == 8) { if(support_avx()) return &gotoblas_ZEN; else{ From 5f2a3c05cd0e3872be3c5686b9da6b627658eeb7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Jul 2018 21:42:28 +0200 Subject: [PATCH 10/20] Revert "Rewrite &= -> = and simplify the initial blocking phase." --- driver/level3/level3_thread.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index ee3e3b9a9..aeb5e6ed4 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { + /* Make sure if no one is using workspace */ + START_RPCC(); + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; + STOP_RPCC(waiting1); + #if defined(FUSED_GEMM) && !defined(TIMING) /* Fused operation to copy region of B into workspace and apply kernel */ @@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif - for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { - /* Make sure if no one is using workspace */ - START_RPCC(); - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; - STOP_RPCC(waiting1); - /* Set flag so other threads can access local region of B */ + /* Set flag so other threads can 
access local region of B */ + for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - WMB; - } + WMB; } /* Get regions of B from other threads and apply kernel */ @@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } } } while (current != mypos); - /* Iterate through steps of m + /* Iterate through steps of m * Note: First step has already been finished */ for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; @@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, js); STOP_RPCC(kernel); - + #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; #endif - + /* Clear synchronization flag if this thread is done with region of B */ if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } } From 045fb5ea2c5b3e64e0ed747d4227ee8f1063ca05 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Jul 2018 07:30:58 +0200 Subject: [PATCH 11/20] Define snprintf for older versions of MSVC for #1677 --- driver/others/openblas_get_config.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 87a27712f..ecafa16c4 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#if defined(_WIN32) && defined(_MSC_VER) +#if _MSC_VER < 1900 +#define snprintf _snprintf_s +#endif +#endif + static char* openblas_config_str="" #ifdef USE64BITINT "USE64BITINT " From 571e9de2ac77d838ba47bb7ec6981c7a5b5e68d0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Jul 2018 11:42:25 +0200 Subject: [PATCH 12/20] Fix definition of snprintf for MSVC MS _snprintf_s takes an additional argument for the size of the buffer, so is not a direct replacement (utest/ctest.h from which I copied was wrong) --- driver/others/openblas_get_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index ecafa16c4..3e87f2cc2 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(_WIN32) && defined(_MSC_VER) #if _MSC_VER < 1900 -#define snprintf _snprintf_s +#define snprintf _snprintf #endif #endif From 1309711e243ee945908b0c6139e9ea35c12e97f1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Jul 2018 11:47:52 +0200 Subject: [PATCH 13/20] Fix declaration of snprintf for older MSVC _snprintf_s takes an additional (size) argument, so is no direct replacement. 
(Note that this code is currently unused - the two instances of snprintf here are within ifdef blocks that are not compiled for MSVC) --- utest/ctest.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/ctest.h b/utest/ctest.h index 1deea32f6..f297dafba 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -84,7 +84,7 @@ struct ctest { #endif #if _MSC_VER < 1900 -#define snprintf _snprintf_s +#define snprintf _snprintf #endif #ifndef __cplusplus From 8d5b33b6be7877d5df3f120d800f25cf900ee4c0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Jul 2018 23:39:00 +0200 Subject: [PATCH 14/20] Add cpu identification via mfpvr call for the BSDs fixes #1671 --- cpuid_power.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/cpuid_power.c b/cpuid_power.c index 951204ae9..6c7baef4a 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -142,6 +142,52 @@ int detect(void){ return CPUTYPE_PPC970; #endif + +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) +unsigned int id; /* unsigned: keeps id >> 16 non-negative so case 0x8003 is reachable */ +__asm __volatile("mfpvr %0" : "=r"(id)); /* asm is a statement; the "=r" output operand stores the result in id */ +switch ( id >> 16 ) { + case 0x4e: // POWER9 + return CPUTYPE_POWER8; + break; + case 0x4d: + case 0x4b: // POWER8/8E + return CPUTYPE_POWER8; + break; + case 0x4a: + case 0x3f: // POWER7/7E + return CPUTYPE_POWER6; + break; + case 0x3e: + return CPUTYPE_POWER6; + break; + case 0x3a: + return CPUTYPE_POWER5; + break; + case 0x35: + case 0x38: // POWER4 /4+ + return CPUTYPE_POWER4; + break; + case 0x40: + case 0x41: // POWER3 /3+ + return CPUTYPE_POWER3; + break; + case 0x39: + case 0x3c: + case 0x44: + case 0x45: + return CPUTYPE_PPC970; + break; + case 0x70: + return CPUTYPE_CELL; + break; + case 0x8003: + return CPUTYPE_PPCG4; + break; + default: + return CPUTYPE_UNKNOWN; + } +#endif } void get_architecture(void){ From 2fbfc64da8d4850bd2d1ba76c873b4b79acbac3b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jul 2018 17:09:55 +0200 Subject: [PATCH 15/20] Use C 
kernels for default c/zAXPY, xROT, c/zSWAP --- kernel/mips64/KERNEL | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 6afb2cf13..57251d3df 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -1,3 +1,12 @@ +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zwap.c + ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S endif From d2142760e0a50a7b268fc64e7c4657449b1e7c0b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jul 2018 17:11:40 +0200 Subject: [PATCH 16/20] Fix precision problem in DSDOT --- kernel/mips64/dot.S | 169 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 159 insertions(+), 10 deletions(-) diff --git a/kernel/mips64/dot.S b/kernel/mips64/dot.S index cb6fbe99c..a645495f4 100644 --- a/kernel/mips64/dot.S +++ b/kernel/mips64/dot.S @@ -103,35 +103,83 @@ .align 3 .L12: +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif LD a1, 4 * SIZE(X) LD b1, 4 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif LD a2, 5 * SIZE(X) LD b2, 5 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 +#endif LD a3, 6 * SIZE(X) LD b3, 6 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif LD a4, 7 * SIZE(X) LD b4, 7 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif LD a1, 8 * SIZE(X) LD b1, 8 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif LD a2, 9 * SIZE(X) LD b2, 9 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 
+#endif LD a3, 10 * SIZE(X) LD b3, 10 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif LD a4, 11 * SIZE(X) LD b4, 11 * SIZE(Y) @@ -143,29 +191,77 @@ .align 3 .L13: +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif LD a1, 4 * SIZE(X) LD b1, 4 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif LD a2, 5 * SIZE(X) LD b2, 5 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 +#endif LD a3, 6 * SIZE(X) LD b3, 6 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif LD a4, 7 * SIZE(X) LD b4, 7 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif daddiu X, X, 8 * SIZE +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif daddiu Y, Y, 8 * SIZE +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 +#endif +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif .align 3 .L15: @@ -179,8 +275,13 @@ LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif daddiu I, I, -1 daddiu X, X, SIZE @@ -225,50 +326,85 @@ LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * 
SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) @@ -277,7 +413,13 @@ daddiu I, I, -1 bgtz I, .L23 +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 +#endif .align 3 .L25: @@ -296,13 +438,20 @@ daddiu I, I, -1 bgtz I, .L26 +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif .align 3 .L999: - ADD s1, s1, s2 #ifdef DSDOT - cvt.d.s s1, s1 + add.d s1, s1, s2 +#else + ADD s1, s1, s2 #endif j $31 NOP From 4e103c822cfd30c8de17ed86b0a1b0c314e6936b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 16 Jul 2018 12:56:39 +0200 Subject: [PATCH 17/20] typo fix --- kernel/mips64/KERNEL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 57251d3df..e257dcfc9 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -5,7 +5,7 @@ DROTKERNEL = ../mips/rot.c CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c -ZSWAPKERNEL = ../mips/zwap.c +ZSWAPKERNEL = ../mips/zswap.c ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S From b14f44d2adbe1ec8ede0cdf06fb8b09f3c4b6e43 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Jul 2018 08:57:56 +0200 Subject: [PATCH 18/20] Temporarily disable special handling 
of OPENMP thread memory allocation for issue #1673 --- driver/others/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 98bcfb216..772c1f232 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef BUFFERS_PER_THREAD -#ifdef USE_OPENMP +#ifdef USE_OPENMP_UNUSED #define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) #else #define BUFFERS_PER_THREAD NUM_BUFFERS @@ -363,7 +363,7 @@ int blas_get_cpu_number(void){ #endif // blas_goto_num = 0; -#ifndef USE_OPENMP +#ifndef USE_OPENMP_UNUSED blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; @@ -494,7 +494,7 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); #endif /* Holds pointers to allocated memory */ -#if defined(SMP) && !defined(USE_OPENMP) +#if defined(SMP) && !defined(USE_OPENMP_UNUSED) /* This is the number of threads than can be spawned by the server, which is the server plus the number of threads in the thread pool */ # define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1 @@ -532,7 +532,7 @@ static BLASULONG alloc_lock = 0UL; /* Returns a pointer to the start of the per-thread memory allocation data */ static __inline struct alloc_t ** get_memory_table() { -#if defined(SMP) && !defined(USE_OPENMP) +#if defined(SMP) && !defined(USE_OPENMP_UNUSED) # if !defined(HAS_COMPILER_TLS) # if defined(OS_WINDOWS) int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); @@ -1057,7 +1057,7 @@ static volatile int memory_initialized = 0; /* 2 : Thread */ static void blas_memory_init(){ -#if defined(SMP) && !defined(USE_OPENMP) +#if defined(SMP) && !defined(USE_OPENMP_UNUSED) next_memory_table_pos = 0; # if !defined(HAS_COMPILER_TLS) # if defined(OS_WINDOWS) From 43ac839c168c652e52320267b0504e6933cb9f60 Mon Sep 17 
00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Jul 2018 09:19:19 +0200 Subject: [PATCH 19/20] Unset memory table entry, not just the temporary pointer to it on shutdown to fix crash with multiple instances of OpenBLAS, #1692 --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 98bcfb216..3bf6ba019 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1279,7 +1279,7 @@ void blas_shutdown(void){ struct alloc_t *alloc_info = local_memory_table[thread][pos]; if (alloc_info) { alloc_info->release_func(alloc_info); - alloc_info = (void *)0; + local_memory_table[thread][pos] = (void *)0; } } } From 73131fa30ac40029b51f49356cd0f1349a815e79 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 24 Jul 2018 17:46:33 +0200 Subject: [PATCH 20/20] Do not treat Windows UWP builds as cross-compiling --- cmake/prebuild.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index b783ef90d..f29bc3a75 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -85,7 +85,7 @@ if (NOT NOFORTRAN) endif () # Cannot run getarch on target if we are cross-compiling -if (DEFINED CORE AND CMAKE_CROSSCOMPILING) +if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) # Write to config as getarch would # TODO: Set up defines that getarch sets up based on every other target