diff --git a/CMakeLists.txt b/CMakeLists.txt index f43e0e0fc..a6cf2ef83 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,10 +29,8 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc else() set(NO_AFFINITY 1) endif() -option(BUILD_SINGLE "Single precision" OFF) -option(BUILD_DOUBLE "Double precision" OFF) -option(BUILD_COMPLEX "Single precision" OFF) -option(BUILD_COMPLEX16 "Single precision" OFF) +option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) +option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using @@ -91,13 +89,13 @@ if (NOT NO_LAPACK) list(APPEND SUBDIRS lapack) endif () -if (NOT DEFINED BUILD_HALF) - set (BUILD_HALF false) +if (NOT DEFINED BUILD_BFLOAT16) + set (BUILD_BFLOAT16 false) endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all -# set(BUILD_HALF true) +# set(BUILD_BFLOAT16 true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -110,33 +108,28 @@ endif() set(FLOAT_TYPES "") if (BUILD_SINGLE) - message(STATUS "Building Songle Precision") - list(APPEND FLOAT_TYPES "SINGLE") - # set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") + message(STATUS "Building Single Precision") + list(APPEND FLOAT_TYPES "SINGLE") # defines nothing endif () if (BUILD_DOUBLE) message(STATUS "Building Double Precision") - list(APPEND FLOAT_TYPES "DOUBLE") - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") + list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE endif () if (BUILD_COMPLEX) message(STATUS "Building Complex Precision") - list(APPEND FLOAT_TYPES "COMPLEX") - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") -endif () + list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX +endif () if (BUILD_COMPLEX16) message(STATUS "Building Double Complex Precision") - list(APPEND FLOAT_TYPES "ZCOMPLEX") - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") + list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE endif () -if (BUILD_HALF) +if (BUILD_BFLOAT16) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "HALF") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_HALF") + list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") @@ -243,6 +236,9 @@ if (NOT MSVC AND NOT NOFORTRAN) add_subdirectory(ctest) endif() add_subdirectory(lapack-netlib/TESTING) + if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_subdirectory(cpp_thread_test) + endif() endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES diff --git a/Makefile.rule b/Makefile.rule index 09dfb0881..67d183936 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -272,17 +272,33 @@ COMMON_PROF = -pg # work at all. # # CPP_THREAD_SAFETY_TEST = 1 +# +# use this to run only the less memory-hungry GEMV test +# CPP_THREAD_SAFETY_GEMV = 1 # If you want to enable the experimental BFLOAT16 support -# BUILD_HALF = 1 -# -# Select if you need to build only select types -# BUILD_SINGLE = 1 -# BUILD_DOUBLE = 1 -# BUILD_COMPLEX = 1 -# BUILD_COMPLEX16 = 1 -# -# +# BUILD_BFLOAT16 = 1 + + +# Set the thread number threshold beyond which the job array for the threaded level3 BLAS +# will be allocated on the heap rather than the stack. (This array alone requires +# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu +# counts, but obviously it is not the only item that ends up on the stack. +# The default value of 32 ensures that the overall requirement is compatible +# with the default 1MB stacksize imposed by having the Java VM loaded without use +# of its -Xss parameter. +# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible +# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java +# VM e.g. in Octave or with the java-based libhdfs in numpy or scipy code +# BLAS3_MEM_ALLOC_THRESHOLD = 160 + + + +# the below is not yet configurable, use cmake if you need to build only select types +BUILD_SINGLE = 1 +BUILD_DOUBLE = 1 +BUILD_COMPLEX = 1 +BUILD_COMPLEX16 = 1 # End of user configuration # diff --git a/Makefile.system b/Makefile.system index eb6e14a98..461f7370b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1232,8 +1232,8 @@ ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif -ifeq ($(BUILD_HALF), 1) -CCOMMON_OPT += -DBUILD_HALF +ifeq ($(BUILD_BFLOAT16), 1) +CCOMMON_OPT += -DBUILD_BFLOAT16 endif ifeq ($(BUILD_SINGLE), 1) CCOMMON_OPT += -DBUILD_SINGLE=1 @@ -1521,10 +1521,10 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 -export BUILD_HALF +export BUILD_BFLOAT16 -export SHGEMM_UNROLL_M -export SHGEMM_UNROLL_N +export SBGEMM_UNROLL_M +export SBGEMM_UNROLL_N export SGEMM_UNROLL_M export SGEMM_UNROLL_N export DGEMM_UNROLL_M diff --git a/Makefile.tail b/Makefile.tail index 641082450..b14689fc7 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -24,14 +24,14 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif -$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX +$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX -$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX +$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX $(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)