Merge pull request #1702 from xianyi/develop

Merge develop for 0.3.2
This commit is contained in:
Martin Kroeker 2018-07-30 07:25:01 +02:00 committed by GitHub
commit e8a68ef261
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 268 additions and 44 deletions

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 1)
set(OpenBLAS_PATCH_VERSION 2)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions

View File

@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1)
RELA = re_lapack
endif
ifeq ($(NO_FORTRAN), 1)
define NOFORTRAN
1
endef
define NO_LAPACK
1
endef
export NOFORTRAN
export NO_LAPACK
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
@ -47,7 +58,7 @@ endif
endif
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
endif
ifneq ($(OSNAME), AIX)
@ -108,7 +119,7 @@ endif
endif
tests :
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
@ -210,7 +221,7 @@ netlib :
else
netlib : lapack_prebuild
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
endif
@ -231,7 +242,7 @@ prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
lapack_prebuild :
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@ -274,21 +285,21 @@ endif
endif
large.tgz :
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz;
fi
endif
timing.tgz :
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/timing.tgz;
fi
endif
lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.1
VERSION = 0.3.2
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
endif ()
# Cannot run getarch on target if we are cross-compiling
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
# Write to config as getarch would
# TODO: Set up defines that getarch sets up based on every other target

View File

@ -68,7 +68,7 @@ endif()
if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
# The probe above leaves the compiler's exit status (or an error string if the
# compiler could not be run at all) in NO_AVX512. Anything other than a clean
# 0 means the AVX512 test program did not build, so disable AVX512 kernels.
# Comparing against exactly 1 would miss compilers that signal failure with a
# different non-zero status.
if (NOT NO_AVX512 EQUAL 0)
  set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()

View File

@ -142,6 +142,52 @@ int detect(void){
return CPUTYPE_PPC970;
#endif
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
  /*
   * On the BSDs, identify the CPU from the Processor Version Register.
   * The switch below dispatches on the upper 16 bits of the value read by
   * "mfpvr".
   *
   * The value is held in an unsigned variable so that the right shift is
   * logical: with a signed int, a register value with the top bit set would
   * shift to a negative number and a case label such as 0x8003 could never
   * match.
   */
  unsigned int id;

  /* The asm statement writes its result through the output operand; it has
     no value of its own and must not be used as an expression. */
  __asm __volatile("mfpvr %0" : "=r"(id));

  switch ( id >> 16 ) {
  case 0x4e: // POWER9
    /* NOTE(review): reported as POWER8 - presumably no dedicated POWER9
       CPUTYPE is available here; confirm. */
    return CPUTYPE_POWER8;
  case 0x4d:
  case 0x4b: // POWER8/8E
    return CPUTYPE_POWER8;
  case 0x4a:
  case 0x3f: // POWER7/7E
    /* NOTE(review): POWER7 is reported as POWER6 - confirm this is intended. */
    return CPUTYPE_POWER6;
  case 0x3e:
    return CPUTYPE_POWER6;
  case 0x3a:
    return CPUTYPE_POWER5;
  case 0x35:
  case 0x38: // POWER4 /4+
    return CPUTYPE_POWER4;
  case 0x40:
  case 0x41: // POWER3 /3+
    return CPUTYPE_POWER3;
  case 0x39:
  case 0x3c:
  case 0x44:
  case 0x45:
    return CPUTYPE_PPC970;
  case 0x70:
    return CPUTYPE_CELL;
  case 0x8003:
    return CPUTYPE_PPCG4;
  default:
    return CPUTYPE_UNKNOWN;
  }
#endif
}
void get_architecture(void){

View File

@ -1452,6 +1452,8 @@ int get_cpuname(void){
switch (model) {
case 1:
// AMD Ryzen
case 8:
// AMD Ryzen2
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_ZEN;

View File

@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
/* Make sure no one is still using the workspace */
START_RPCC();
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
#if defined(FUSED_GEMM) && !defined(TIMING)
/* Fused operation to copy region of B into workspace and apply kernel */
@ -381,16 +387,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
#endif
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
/* Make sure if no one is using workspace */
START_RPCC();
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
/* Set flag so other threads can access local region of B */
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
WMB;
}
}
/* Get regions of B from other threads and apply kernel */
current = mypos;
@ -425,7 +426,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
@ -468,7 +469,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with region of B */
if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}

View File

@ -510,7 +510,7 @@ static gotoblas_t *get_coretype(void){
#ifndef NO_AVX2
return &gotoblas_HASWELL;
#else
return &gotblas_SANDYBRIDGE;
return &gotoblas_SANDYBRIDGE;
#endif
else
return &gotoblas_NEHALEM;
@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){
}
}
} else if (exfamily == 8) {
if (model == 1) {
if (model == 1 || model == 8) {
if(support_avx())
return &gotoblas_ZEN;
else{

View File

@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifndef BUFFERS_PER_THREAD
#ifdef USE_OPENMP
#ifdef USE_OPENMP_UNUSED
#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#else
#define BUFFERS_PER_THREAD NUM_BUFFERS
@ -363,7 +363,7 @@ int blas_get_cpu_number(void){
#endif
// blas_goto_num = 0;
#ifndef USE_OPENMP
#ifndef USE_OPENMP_UNUSED
blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
@ -494,10 +494,10 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
#endif
/* Holds pointers to allocated memory */
#if defined(SMP) && !defined(USE_OPENMP)
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
/* This is the number of threads that can be spawned by the server, which is the
server plus the number of threads in the thread pool */
# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1
/* Parenthesized so the product expands as a single value wherever the macro
   is used inside a larger expression. */
# define MAX_ALLOCATING_THREADS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2)
static int next_memory_table_pos = 0;
# if defined(HAS_COMPILER_TLS)
/* Use compiler generated thread-local-storage */
@ -532,7 +532,7 @@ static BLASULONG alloc_lock = 0UL;
/* Returns a pointer to the start of the per-thread memory allocation data */
static __inline struct alloc_t ** get_memory_table() {
#if defined(SMP) && !defined(USE_OPENMP)
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
int local_memory_table_pos = (int)::TlsGetValue(local_storage_key);
@ -1057,7 +1057,7 @@ static volatile int memory_initialized = 0;
/* 2 : Thread */
static void blas_memory_init(){
#if defined(SMP) && !defined(USE_OPENMP)
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
next_memory_table_pos = 0;
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
@ -1279,7 +1279,7 @@ void blas_shutdown(void){
struct alloc_t *alloc_info = local_memory_table[thread][pos];
if (alloc_info) {
alloc_info->release_func(alloc_info);
alloc_info = (void *)0;
local_memory_table[thread][pos] = (void *)0;
}
}
}

View File

@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <string.h>
#if defined(_WIN32) && defined(_MSC_VER)
#if _MSC_VER < 1900
#define snprintf _snprintf
#endif
#endif
static char* openblas_config_str=""
#ifdef USE64BITINT
"USE64BITINT "

View File

@ -1,3 +1,12 @@
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c
SROTKERNEL = ../mips/rot.c
DROTKERNEL = ../mips/rot.c
CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c
ifndef SNRM2KERNEL
SNRM2KERNEL = snrm2.S
endif

View File

@ -103,35 +103,83 @@
.align 3
.L12:
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 4 * SIZE(X)
LD b1, 4 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
LD a2, 5 * SIZE(X)
LD b2, 5 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
LD a3, 6 * SIZE(X)
LD b3, 6 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
LD a4, 7 * SIZE(X)
LD b4, 7 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 8 * SIZE(X)
LD b1, 8 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
LD a2, 9 * SIZE(X)
LD b2, 9 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
LD a3, 10 * SIZE(X)
LD b3, 10 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
LD a4, 11 * SIZE(X)
LD b4, 11 * SIZE(Y)
@ -143,29 +191,77 @@
.align 3
.L13:
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 4 * SIZE(X)
LD b1, 4 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
LD a2, 5 * SIZE(X)
LD b2, 5 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
LD a3, 6 * SIZE(X)
LD b3, 6 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
LD a4, 7 * SIZE(X)
LD b4, 7 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
daddiu X, X, 8 * SIZE
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
daddiu Y, Y, 8 * SIZE
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
.align 3
.L15:
@ -179,8 +275,13 @@
LD a1, 0 * SIZE(X)
LD b1, 0 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
daddiu I, I, -1
daddiu X, X, SIZE
@ -225,50 +326,85 @@
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
@ -277,7 +413,13 @@
daddiu I, I, -1
bgtz I, .L23
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
.align 3
.L25:
@ -296,13 +438,20 @@
daddiu I, I, -1
bgtz I, .L26
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
.align 3
.L999:
ADD s1, s1, s2
#ifdef DSDOT
/* In the DSDOT build both partial sums are accumulated with madd.d, so s1
   and s2 already hold double-precision values. They must be added as-is:
   running cvt.d.s on s1 here would reinterpret the double accumulator as a
   single-precision value and corrupt the result. */
	add.d	s1, s1, s2
#else
	ADD	s1, s1, s2
#endif
j $31
NOP

View File

@ -84,7 +84,7 @@ struct ctest {
#endif
#if _MSC_VER < 1900
#define snprintf _snprintf_s
#define snprintf _snprintf
#endif
#ifndef __cplusplus