OpenBLAS 0.3.1 -> 0.3.2 patch (unified diff, one diff line per line).

Summary of what the hunks below change:
  * CMakeLists.txt, Makefile.rule: bump the version from 0.3.1 to 0.3.2.
  * Makefile: NO_FORTRAN=1 now defines and exports NOFORTRAN and NO_LAPACK;
    the Fortran-only sections are guarded by a test on the value of
    NOFORTRAN (unset or 0) instead of "ifndef NOFORTRAN".
  * cmake/prebuild.cmake: the cross-compile branch is skipped when HOST_OS is
    WINDOWSSTORE. HOST_OS is expanded inside quotes so that an empty value
    cannot produce a malformed if() expression.
  * cmake/system_check.cmake: the AVX512 probe passes -march=skylake-avx512
    and silences the compiler output.
  * cpuid_power.c: adds a mfpvr-based detect() path for FreeBSD, OpenBSD and
    DragonFly. The inline asm is a statement whose output operand writes
    "id"; it must not be used as the right-hand side of an assignment.
  * cpuid_x86.c, driver/others/dynamic.c: model 8 is handled like model 1
    (ZEN); dynamic.c also corrects the gotoblas_SANDYBRIDGE identifier.
  * driver/level3/level3_thread.c: the wait for the workspace flags is moved
    to the top of the js loop and iterates over all args->nthreads entries.
  * driver/others/memory.c: the USE_OPENMP guards are renamed to
    USE_OPENMP_UNUSED, MAX_ALLOCATING_THREADS changes, and blas_shutdown()
    clears the table slot instead of a local copy of the pointer.
  * driver/others/openblas_get_config.c, utest/ctest.h: snprintf is mapped to
    _snprintf for MSVC older than 1900.
  * kernel/mips64/KERNEL, kernel/mips64/dot.S: select generic C kernels for
    some routines and accumulate in double precision when DSDOT is defined.

NOTE(review): blank context lines and indentation were reconstructed from
the hunk line counts; confirm against the upstream trees before applying.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ae95734cb..b8602da96 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 1)
+set(OpenBLAS_PATCH_VERSION 2)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions
diff --git a/Makefile b/Makefile
index 56b4426f8..b947c1198 100644
--- a/Makefile
+++ b/Makefile
@@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1)
 RELA = re_lapack
 endif
 
+ifeq ($(NO_FORTRAN), 1)
+define NOFORTRAN
+1
+endef
+define NO_LAPACK
+1
+endef
+export NOFORTRAN
+export NO_LAPACK
+endif
+
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
 
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
@@ -47,7 +58,7 @@ endif
 endif
 
 	@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
-ifndef NOFORTRAN
+ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
 endif
 ifneq ($(OSNAME), AIX)
@@ -108,7 +119,7 @@ endif
 endif
 
 tests :
-ifndef NOFORTRAN
+ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	touch $(LIBNAME)
 ifndef NO_FBLAS
 	$(MAKE) -C test all
@@ -210,7 +221,7 @@ netlib :
 
 else
 netlib : lapack_prebuild
-ifndef NOFORTRAN
+ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
 endif
@@ -231,7 +242,7 @@ prof_lapack : lapack_prebuild
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
 
 lapack_prebuild :
-ifndef NOFORTRAN
+ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -274,21 +285,21 @@ endif
 endif
 
 large.tgz :
-ifndef NOFORTRAN
+ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	if [ ! -a $< ]; then
 	-wget http://www.netlib.org/lapack/timing/large.tgz;
 	fi
 endif
 
 timing.tgz :
-ifndef NOFORTRAN
+ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	if [ ! -a $< ]; then
 	-wget http://www.netlib.org/lapack/timing/timing.tgz;
 	fi
 endif
 
 lapack-timing : large.tgz timing.tgz
-ifndef NOFORTRAN
+ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
 	(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
 	$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
diff --git a/Makefile.rule b/Makefile.rule
index e0f48397f..c205c0c1c 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.1
+VERSION = 0.3.2
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index b783ef90d..f29bc3a75 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
 endif ()
 
 # Cannot run getarch on target if we are cross-compiling
-if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
+if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT ("${HOST_OS}" STREQUAL "WINDOWSSTORE"))
   # Write to config as getarch would
 
   # TODO: Set up defines that getarch sets up based on every other target
diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index a565fc0d5..d339a755f 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -68,7 +68,7 @@ endif()
 
 if (X86_64 OR X86)
   file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
-execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
+execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
   if (NO_AVX512 EQUAL 1)
     set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
   endif()
diff --git a/cpuid_power.c b/cpuid_power.c
index 951204ae9..6c7baef4a 100644
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -142,6 +142,52 @@ int detect(void){
 
   return CPUTYPE_PPC970;
 #endif
+
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
+int id;
+__asm __volatile("mfpvr %0" : "=r"(id));
+switch ( id >> 16 ) {
+  case 0x4e: // POWER9
+    return CPUTYPE_POWER8;
+    break;
+  case 0x4d:
+  case 0x4b: // POWER8/8E
+    return CPUTYPE_POWER8;
+    break;
+  case 0x4a:
+  case 0x3f: // POWER7/7E
+    return CPUTYPE_POWER6;
+    break;
+  case 0x3e:
+    return CPUTYPE_POWER6;
+    break;
+  case 0x3a:
+    return CPUTYPE_POWER5;
+    break;
+  case 0x35:
+  case 0x38: // POWER4 /4+
+    return CPUTYPE_POWER4;
+    break;
+  case 0x40:
+  case 0x41: // POWER3 /3+
+    return CPUTYPE_POWER3;
+    break;
+  case 0x39:
+  case 0x3c:
+  case 0x44:
+  case 0x45:
+    return CPUTYPE_PPC970;
+    break;
+  case 0x70:
+    return CPUTYPE_CELL;
+    break;
+  case 0x8003:
+    return CPUTYPE_PPCG4;
+    break;
+  default:
+    return CPUTYPE_UNKNOWN;
+  }
+#endif
 }
 
 void get_architecture(void){
diff --git a/cpuid_x86.c b/cpuid_x86.c
index 89eb809b0..512ad877b 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1452,6 +1452,8 @@ int get_cpuname(void){
 	switch (model) {
 	  case 1:
 	    // AMD Ryzen
+	  case 8:
+	    // AMD Ryzen2
 	    if(support_avx())
 #ifndef NO_AVX2
 	      return CPUTYPE_ZEN;
diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index ee3e3b9a9..aeb5e6ed4 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
     div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
     for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
+      /* Make sure if no one is using workspace */
+      START_RPCC();
+      for (i = 0; i < args -> nthreads; i++)
+        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
+      STOP_RPCC(waiting1);
+
 #if defined(FUSED_GEMM) && !defined(TIMING)
 
       /* Fused operation to copy region of B into workspace and apply kernel */
@@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       }
 #endif
 
-      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
-        /* Make sure if no one is using workspace */
-        START_RPCC();
-        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
-        STOP_RPCC(waiting1);
-        /* Set flag so other threads can access local region of B */
+      /* Set flag so other threads can access local region of B */
+      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
         job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
-        WMB;
-      }
+      WMB;
     }
 
     /* Get regions of B from other threads and apply kernel */
@@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
         /* Clear synchronization flag if this thread is done with other region of B */
         if (m_to - m_from == min_i) {
-          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
+          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
           WMB;
         }
       }
     } while (current != mypos);
 
-    /* Iterate through steps of m 
+    /* Iterate through steps of m
      * Note: First step has already been finished */
     for(is = m_from + min_i; is < m_to; is += min_i){
       min_i = m_to - is;
@@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
                            sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                            c, ldc, is, js);
           STOP_RPCC(kernel);
-          
+
 #ifdef TIMING
           ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
 #endif
-          
+
           /* Clear synchronization flag if this thread is done with region of B */
           if (is + min_i >= m_to) {
-            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
+            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
             WMB;
           }
         }
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 13794207c..1f67dc521 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -510,7 +510,7 @@ static gotoblas_t *get_coretype(void){
 #ifndef NO_AVX2
 	    return &gotoblas_HASWELL;
 #else
-	    return &gotblas_SANDYBRIDGE;
+	    return &gotoblas_SANDYBRIDGE;
 #endif
 	  else
 	    return &gotoblas_NEHALEM;
@@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){
 	  }
 	}
       } else if (exfamily == 8) {
-	if (model == 1) {
+	if (model == 1 || model == 8) {
 	  if(support_avx())
 	    return &gotoblas_ZEN;
 	  else{
diff --git a/driver/others/memory.c b/driver/others/memory.c
index 98bcfb216..959837a52 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifndef BUFFERS_PER_THREAD
-#ifdef USE_OPENMP
+#ifdef USE_OPENMP_UNUSED
 #define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
 #else
 #define BUFFERS_PER_THREAD NUM_BUFFERS
@@ -363,7 +363,7 @@ int blas_get_cpu_number(void){
 #endif
 
   // blas_goto_num = 0;
-#ifndef USE_OPENMP
+#ifndef USE_OPENMP_UNUSED
   blas_goto_num=openblas_num_threads_env();
   if (blas_goto_num < 0)
     blas_goto_num = 0;
@@ -494,10 +494,10 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
 #endif
 
 /* Holds pointers to allocated memory */
-#if defined(SMP) && !defined(USE_OPENMP)
+#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
 /* This is the number of threads than can be spawned by the server, which is the server
    plus the number of threads in the thread pool */
-# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1
+# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2
 static int next_memory_table_pos = 0;
 # if defined(HAS_COMPILER_TLS)
 /* Use compiler generated thread-local-storage */
@@ -532,7 +532,7 @@ static BLASULONG alloc_lock = 0UL;
 
 /* Returns a pointer to the start of the per-thread memory allocation data */
 static __inline struct alloc_t ** get_memory_table() {
-#if defined(SMP) && !defined(USE_OPENMP)
+#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
 # if !defined(HAS_COMPILER_TLS)
 # if defined(OS_WINDOWS)
   int local_memory_table_pos = (int)::TlsGetValue(local_storage_key);
@@ -1057,7 +1057,7 @@ static volatile int memory_initialized = 0;
 /* 2 : Thread */
 
 static void blas_memory_init(){
-#if defined(SMP) && !defined(USE_OPENMP)
+#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
   next_memory_table_pos = 0;
 # if !defined(HAS_COMPILER_TLS)
 # if defined(OS_WINDOWS)
@@ -1279,7 +1279,7 @@ void blas_shutdown(void){
       struct alloc_t *alloc_info = local_memory_table[thread][pos];
       if (alloc_info) {
         alloc_info->release_func(alloc_info);
-        alloc_info = (void *)0;
+        local_memory_table[thread][pos] = (void *)0;
       }
     }
   }
diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c
index 87a27712f..3e87f2cc2 100644
--- a/driver/others/openblas_get_config.c
+++ b/driver/others/openblas_get_config.c
@@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <string.h>
 
+#if defined(_WIN32) && defined(_MSC_VER)
+#if _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+#endif
+
 static char* openblas_config_str=""
 #ifdef USE64BITINT
   "USE64BITINT "
diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL
index 6afb2cf13..e257dcfc9 100644
--- a/kernel/mips64/KERNEL
+++ b/kernel/mips64/KERNEL
@@ -1,3 +1,12 @@
+CAXPYKERNEL = ../mips/zaxpy.c
+ZAXPYKERNEL = ../mips/zaxpy.c
+SROTKERNEL = ../mips/rot.c
+DROTKERNEL = ../mips/rot.c
+CROTKERNEL = ../mips/zrot.c
+ZROTKERNEL = ../mips/zrot.c
+CSWAPKERNEL = ../mips/zswap.c
+ZSWAPKERNEL = ../mips/zswap.c
+
 ifndef SNRM2KERNEL
 SNRM2KERNEL = snrm2.S
 endif
diff --git a/kernel/mips64/dot.S b/kernel/mips64/dot.S
index cb6fbe99c..a645495f4 100644
--- a/kernel/mips64/dot.S
+++ b/kernel/mips64/dot.S
@@ -103,35 +103,83 @@
 	.align 3
 
 .L12:
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD s1, s1, a1, b1
+#endif
 	LD a1, 4 * SIZE(X)
 	LD b1, 4 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a2, a2
+cvt.d.s b2, b2
+madd.d s2, s2, a2, b2
+#else
 	MADD s2, s2, a2, b2
+#endif
 	LD a2, 5 * SIZE(X)
 	LD b2, 5 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a3, a3
+cvt.d.s b3, b3
+madd.d s1, s1, a3, b3
+#else
 	MADD s1, s1, a3, b3
+#endif
 	LD a3, 6 * SIZE(X)
 	LD b3, 6 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a4, a4
+cvt.d.s b4, b4
+madd.d s2, s2, a4, b4
+#else
 	MADD s2, s2, a4, b4
+#endif
 	LD a4, 7 * SIZE(X)
 	LD b4, 7 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD s1, s1, a1, b1
+#endif
 	LD a1, 8 * SIZE(X)
 	LD b1, 8 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a2, a2
+cvt.d.s b2, b2
+madd.d s2, s2, a2, b2
+#else
 	MADD s2, s2, a2, b2
+#endif
 	LD a2, 9 * SIZE(X)
 	LD b2, 9 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a3, a3
+cvt.d.s b3, b3
+madd.d s1, s1, a3, b3
+#else
 	MADD s1, s1, a3, b3
+#endif
 	LD a3, 10 * SIZE(X)
 	LD b3, 10 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a4, a4
+cvt.d.s b4, b4
+madd.d s2, s2, a4, b4
+#else
 	MADD s2, s2, a4, b4
+#endif
 	LD a4, 11 * SIZE(X)
 	LD b4, 11 * SIZE(Y)
 
@@ -143,29 +191,77 @@
 	.align 3
 
 .L13:
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD s1, s1, a1, b1
+#endif
 	LD a1, 4 * SIZE(X)
 	LD b1, 4 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a2, a2
+cvt.d.s b2, b2
+madd.d s2, s2, a2, b2
+#else
 	MADD s2, s2, a2, b2
+#endif
 	LD a2, 5 * SIZE(X)
 	LD b2, 5 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a3, a3
+cvt.d.s b3, b3
+madd.d s1, s1, a3, b3
+#else
 	MADD s1, s1, a3, b3
+#endif
 	LD a3, 6 * SIZE(X)
 	LD b3, 6 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a4, a4
+cvt.d.s b4, b4
+madd.d s2, s2, a4, b4
+#else
 	MADD s2, s2, a4, b4
+#endif
 	LD a4, 7 * SIZE(X)
 	LD b4, 7 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD s1, s1, a1, b1
+#endif
 	daddiu X, X, 8 * SIZE
+#ifdef DSDOT
+cvt.d.s a2, a2
+cvt.d.s b2, b2
+madd.d s2, s2, a2, b2
+#else
 	MADD s2, s2, a2, b2
+#endif
 	daddiu Y, Y, 8 * SIZE
 
+#ifdef DSDOT
+cvt.d.s a3, a3
+cvt.d.s b3, b3
+madd.d s1, s1, a3, b3
+#else
 	MADD s1, s1, a3, b3
+#endif
+#ifdef DSDOT
+cvt.d.s a4, a4
+cvt.d.s b4, b4
+madd.d s2, s2, a4, b4
+#else
 	MADD s2, s2, a4, b4
+#endif
 	.align 3
 
 .L15:
@@ -179,8 +275,13 @@
 	LD a1, 0 * SIZE(X)
 	LD b1, 0 * SIZE(Y)
 
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD s1, s1, a1, b1
-
+#endif
 	daddiu I, I, -1
 
 	daddiu X, X, SIZE
@@ -225,50 +326,85 @@
 	LD b1, 0 * SIZE(Y)
 	dadd Y, Y, INCY
 
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD s1, s1, a1, b1
-
+#endif
 	LD a1, 0 * SIZE(X)
 	dadd X, X, INCX
 	LD b1, 0 * SIZE(Y)
 	dadd Y, Y, INCY
 
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s2, s2, a1, b1
+#else
 	MADD s2, s2, a1, b1
-
+#endif
 	LD a1, 0 * SIZE(X)
 	dadd X, X, INCX
 	LD b1, 0 * SIZE(Y)
 	dadd Y, Y, INCY
 
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD s1, s1, a1, b1
-
+#endif
 	LD a1, 0 * SIZE(X)
 	dadd X, X, INCX
 	LD b1, 0 * SIZE(Y)
 	dadd Y, Y, INCY
 
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s2, s2, a1, b1
+#else
 	MADD s2, s2, a1, b1
-
+#endif
 	LD a1, 0 * SIZE(X)
 	dadd X, X, INCX
 	LD b1, 0 * SIZE(Y)
 	dadd Y, Y, INCY
 
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD s1, s1, a1, b1
-
+#endif
 	LD a1, 0 * SIZE(X)
 	dadd X, X, INCX
 	LD b1, 0 * SIZE(Y)
 	dadd Y, Y, INCY
 
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s2, s2, a1, b1
+#else
 	MADD s2, s2, a1, b1
-
+#endif
 	LD a1, 0 * SIZE(X)
 	dadd X, X, INCX
 	LD b1, 0 * SIZE(Y)
 	dadd Y, Y, INCY
 
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD s1, s1, a1, b1
-
+#endif
 	LD a1, 0 * SIZE(X)
 	dadd X, X, INCX
 	LD b1, 0 * SIZE(Y)
@@ -277,7 +413,13 @@
 	daddiu I, I, -1
 
 	bgtz I, .L23
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s2, s2, a1, b1
+#else
 	MADD s2, s2, a1, b1
+#endif
 	.align 3
 
 .L25:
@@ -296,13 +438,20 @@
 	daddiu I, I, -1
 
 	bgtz I, .L26
+#ifdef DSDOT
+cvt.d.s a1, a1
+cvt.d.s b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD s1, s1, a1, b1
+#endif
 	.align 3
 
 .L999:
-	ADD s1, s1, s2
 #ifdef DSDOT
-	cvt.d.s s1, s1
+	add.d s1, s1, s2
+#else
+	ADD s1, s1, s2
 #endif
 	j $31
 	NOP
diff --git a/utest/ctest.h b/utest/ctest.h
index 1deea32f6..f297dafba 100644
--- a/utest/ctest.h
+++ b/utest/ctest.h
@@ -84,7 +84,7 @@ struct ctest {
 #endif
 
 #if _MSC_VER < 1900
-#define snprintf _snprintf_s
+#define snprintf _snprintf
 #endif
 
 #ifndef __cplusplus