diff --git a/CMakeLists.txt b/CMakeLists.txt index b8602da96..20ce02e87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 2) +set(OpenBLAS_PATCH_VERSION 3.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -150,6 +150,7 @@ endif() # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) +target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) # Android needs to explicitly link against libm if(ANDROID) @@ -169,6 +170,7 @@ endif() # Set output for libopenblas set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") +set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) diff --git a/Changelog.txt b/Changelog.txt index cb6fee70a..33dcacc51 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,115 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.2 +30-Jul-2018 + +common: + * fixes for regressions caused by the rewrite of the thread + initialization code in 0.3.1 + +POWER: + * fixed cpu autodetection for the BSDs + +MIPS64: + * fixed utest errors in AXPY, DSDOT, ROT and SWAP + +x86_64: + * added autodetection of AMD Ryzen 2 + * fixed build with older versions of MSVC + +==================================================================== +Version 0.3.1 +01-Jul-2018 + +common: + * rewritten thread initialization code with significantly reduced overhead + * added CBLAS interfaces 
to the IxAMIN BLAS extension functions + * fixed the lapack-test target + * CMAKE builds now create an OpenBLASConfig.cmake file + * ZAXPY now uses a single thread for small input sizes + * the LAPACK code was updated from Reference-LAPACK/lapack#253 + (fixing LAPACKE interfaces to Aasen's functions) + +POWER: + * corrected CROT and ZROT behaviour with zero INC_X + +ARMV7: + * corrected xDOT behaviour with zero INC_X or INC_Y + +x86_64: + * retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER, + this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO + (which will still be supported via the slower PRESCOTT kernels when this option is not set) + * added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to + specify the list of x86_64 targets to include. Any target not on the list will be supported + by the Sandybridge or Nehalem kernels if available, or by Prescott. + * improved SWITCH_RATIO on Haswell for increased GEMM throughput + * added initial support for Intel Skylake X, including an AVX512 SGEMM kernel + * added autodetection of Intel Cannon Lake series as Skylake X + * added a default L2 cache size for hypervisors that return zero here (Chromebook) + * fixed a name clash with recent Windows10 headers that broke the build with (at least) + recent mingw from MSYS2 + * fixed a link error in mixed clang/gfortran builds with OpenMP + * updated the OSX deployment target to 10.8 + * switched on parallel make for builds on MS Windows by default + +x86: + * fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y + +==================================================================== +Version 0.3.0 +23-May-2018 + +common: + * fixed some more thread race and locking bugs + * added preliminary support for calling an OpenMP build of the library from multiple threads + * removed performance impact of thread locks added in 0.2.20 on OpenMP code + * general code cleanup + * optimized DSDOT 
implementation + * improved thread distribution for GEMM + * corrected IMATCOPY/OMATCOPY implementation + * fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations + * cmake build improvements + * pkgconfig file now contains build options + * openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build + * corrections and improvements for systems with more than 64 cpus + * LAPACK code updated to 3.8.0 including later fixes + * added ReLAPACK, a recursive implementation of several LAPACK functions + * Rewrote ROTMG to handle cases that the netlib code failed to address + * Disabled (broken) multithreading code for xTRMV + * corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard + * shared memory access failures on startup are now handled more gracefully + * restored utests from earlier releases (and made them pass on all affected systems) + +SPARC: + * several fixes for cpu autodetection + +POWER: + * corrected vector register overwriting in several Power8 kernels + * optimized additional BLAS functions + +ARM: + * added support for CortexA53 and A72 + * added autodetection for ThunderX2T99 + * made most optimized kernels the default for generic ARMv8 targets + +x86_64: + * parallelized DDOT kernel for Haswell + * changed alignment directives in assembly kernels to boost performance on OSX + * fixed register handling in the GEMV microkernels (bug exposed by gcc7) + * added support for building on OpenBSD and Dragonfly + * updated compiler options to work with Intel release 2018 + * support fully optimized build with clang/flang on Microsoft Windows + * fixed building on AIX + +IBM Z: + * added optimized BLAS 1/2 functions + +MIPS: + * fixed cpu autodetection helper code + * added mips32 1004K cpu (Mediatek MT7621 and similar SoC) + * added mips64 I6500 cpu + ==================================================================== Version 0.2.20 24-Jul-2017 diff 
--git a/Makefile b/Makefile index b947c1198..d99521b19 100644 --- a/Makefile +++ b/Makefile @@ -97,7 +97,7 @@ endif shared : ifndef NO_SHARED -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) @@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN) ifdef SMP ifeq ($(OSNAME), WINNT) -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +else ifeq ($(OSNAME), Haiku) + -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc else -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc endif diff --git a/Makefile.install b/Makefile.install index c51c8a021..fa657beba 100644 --- a/Makefile.install +++ b/Makefile.install @@ -66,7 +66,7 @@ endif #for install shared library ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ diff --git a/Makefile.rule b/Makefile.rule index c205c0c1c..4b815d7a8 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.2 +VERSION = 0.3.3.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -107,7 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1 # BUILD_RELAPACK = 1 # If you want to use legacy threaded Level 3 implementation. 
-# USE_SIMPLE_THREADED_LEVEL3 = 1 +USE_SIMPLE_THREADED_LEVEL3 = 1 + +# If you want to use the new, still somewhat experimental code that uses +# thread-local storage instead of a central memory buffer in memory.c +# Note that if your system uses GLIBC, it needs to have at least glibc 2.21 +# for this to work. +USE_TLS = 1 # If you want to drive whole 64bit region by BLAS. Not all Fortran # compiler supports this. It's safe to keep comment it out if you diff --git a/Makefile.system b/Makefile.system index 4712d9525..2123af204 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1018,6 +1018,10 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif +ifdef USE_TLS +CCOMMON_OPT += -DUSE_TLS +endif + ifndef SYMBOLPREFIX SYMBOLPREFIX = endif diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 677c05d93..f831b5040 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX) ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 FCOMMON_OPT += -march=skylake-avx512 +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +endif endif endif diff --git a/README.md b/README.md index 02d087334..9ed9be337 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`. - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. +- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. @@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. 
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. +* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels. * The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build the library with `BIGNUMA=1`. diff --git a/benchmark/gemv.c b/benchmark/gemv.c index c06e829d9..b6a42f42f 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -122,7 +122,7 @@ int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 0.0}; char trans='N'; blasint m, i, j; blasint inc_x=1,inc_y=1; diff --git a/c_check b/c_check index 3831d7aa3..66acf1cad 100644 --- a/c_check +++ b/c_check @@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/); $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); $os = Interix if ($data =~ /OS_INTERIX/); $os = Android if ($data =~ /OS_ANDROID/); +$os = Haiku if ($data =~ /OS_HAIKU/); $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); @@ -223,7 +224,6 @@ $data =~ /globl\s([_\.]*)(.*)/; $need_fu = $1; $cross = 0; -$cross = 1 if ($os ne $hostos); if ($architecture ne $hostarch) { $cross = 1; @@ -231,6 +231,8 @@ if ($architecture ne $hostarch) { $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); } +$cross = 1 if ($os ne $hostos); + $openmp = "" if $ENV{USE_OPENMP} != 1; $linker_L = ""; diff --git a/cmake/system.cmake b/cmake/system.cmake index 48e8f75bc..18b2c3b87 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -214,6 +214,10 @@ if (CONSISTENT_FPCSR) set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") endif () +if (USE_TLS) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS") +endif () + # Only for development # set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") # set(CCOMMON_OPT 
"${CCOMMON_OPT} -DPREFETCHTEST") diff --git a/common.h b/common.h index 663f37e7b..6c3d5b15e 100644 --- a/common.h +++ b/common.h @@ -105,6 +105,10 @@ extern "C" { #endif #endif +#ifdef OS_HAIKU +#define NO_SYSV_IPC +#endif + #ifdef OS_WINDOWS #ifdef ATOM #define GOTO_ATOM ATOM @@ -253,8 +257,14 @@ typedef unsigned long BLASULONG; #ifdef USE64BITINT typedef BLASLONG blasint; +#if defined(OS_WINDOWS) && defined(__64BIT__) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif #else typedef int blasint; +#define blasabs(x) abs(x) #endif #else #ifdef USE64BITINT diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 4e1935429..e0d9221f3 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -29,15 +29,18 @@ #define CPU_GENERIC 0 #define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", - "Z13" + "Z13", + "Z14" }; static char *cpuname_lower[] = { "zarch_generic", - "z13" + "z13", + "z14" }; int detect(void) @@ -62,6 +65,10 @@ int detect(void) if (strstr(p, "2964")) return CPU_Z13; if (strstr(p, "2965")) return CPU_Z13; + /* detect z14, but fall back to z13 */ + if (strstr(p, "3906")) return CPU_Z13; + if (strstr(p, "3907")) return CPU_Z13; + return CPU_GENERIC; } @@ -107,5 +114,9 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; + case CPU_Z14: + printf("#define Z14\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + break; } } diff --git a/ctest.c b/ctest.c index 00be423d1..0571e9e02 100644 --- a/ctest.c +++ b/ctest.c @@ -101,6 +101,10 @@ OS_INTERIX OS_LINUX #endif +#if defined(__HAIKU__) +OS_HAIKU +#endif + #if defined(__i386) || defined(_X86) ARCH_X86 #endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 794dfb20e..1d7f570d8 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*********************************************************************/ #include "common.h" -#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU) #include #include #include diff --git a/driver/others/memory.c b/driver/others/memory.c index 959837a52..9d4ab19f5 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -72,6 +72,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //#undef DEBUG #include "common.h" + +#if defined(USE_TLS) +#define COMPILE_TLS +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2,20) +#undef COMPILE_TLS +#endif +#endif +#endif + +#if defined(COMPILE_TLS) + #include #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) @@ -108,6 +120,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +#ifdef OS_HAIKU +#include +#endif + #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include @@ -139,14 +155,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FIXED_PAGESIZE 4096 #endif -#ifndef BUFFERS_PER_THREAD -#ifdef USE_OPENMP_UNUSED -#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) -#else -#define BUFFERS_PER_THREAD NUM_BUFFERS -#endif -#endif - #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) @@ -238,6 +246,14 @@ int get_num_procs(void) { } #endif +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + #ifdef OS_WINDOWS int get_num_procs(void) { @@ -420,10 +436,8 @@ int openblas_get_num_threads(void) { int hugetlb_allocated = 0; #if defined(OS_WINDOWS) -#define THREAD_LOCAL __declspec(thread) #define LIKELY_ONE(x) (x) #else -#define THREAD_LOCAL __thread #define LIKELY_ONE(x) (__builtin_expect(x, 1)) #endif @@ -459,62 +473,16 @@ struct alloc_t { for an auxiliary tracking structure. */ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); -/* Clang supports TLS from version 2.8 */ -#if defined(__clang__) && __clang_major__ > 2 || \ - (__clang_minor__ == 2 || __clang_minor__ == 8) -#define HAS_COMPILER_TLS -#endif +#if defined(SMP) +# if defined(OS_WINDOWS) +static DWORD local_storage_key = 0; +DWORD lsk; -/* GCC supports TLS from version 4.1 */ -#if !defined(__clang__) && defined(__GNUC__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) -#define HAS_COMPILER_TLS -#endif - -/* MSVC supports TLS from version 2005 */ -#if defined(_MSC_VER) && _MSC_VER >= 1400 -#define HAS_COMPILER_TLS -#endif - -/* Versions of XCode before 8 did not properly support TLS */ -#if defined(__apple_build_version__) && __apple_build_version__ < 8000042 -#undef HAS_COMPILER_TLS -#endif - -/* Android NDK's before version 12b did not support TLS */ -#if defined(__ANDROID__) && defined(__clang__) -#if __has_include() -#include -#endif -#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ - defined(__NDK_MINOR__) && \ - 
((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) -#undef HAS_COMPILER_TLS -#endif -#endif - -/* Holds pointers to allocated memory */ -#if defined(SMP) && !defined(USE_OPENMP_UNUSED) -/* This is the number of threads than can be spawned by the server, which is the - server plus the number of threads in the thread pool */ -# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2 -static int next_memory_table_pos = 0; -# if defined(HAS_COMPILER_TLS) -/* Use compiler generated thread-local-storage */ -static int THREAD_LOCAL local_memory_table_pos = 0; # else -/* Use system-dependent thread-local-storage */ -# if defined(OS_WINDOWS) -static DWORD local_storage_key; -# else -static pthread_key_t local_storage_key; -# endif /* defined(OS_WINDOWS) */ -# endif /* defined(HAS_COMPILER_TLS) */ -#else -/* There is only one allocating thread when in single-threaded mode and when using OpenMP */ -# define MAX_ALLOCATING_THREADS 1 -#endif /* defined(SMP) && !defined(USE_OPENMP) */ -static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD]; +static pthread_key_t local_storage_key = 0; +pthread_key_t lsk; +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -530,34 +498,54 @@ static pthread_spinlock_t alloc_lock = 0; static BLASULONG alloc_lock = 0UL; #endif +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t key_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t key_lock = 0; +#else +static BLASULONG key_lock = 0UL; +#endif + /* Returns a pointer to the start of the per-thread memory allocation data */ static __inline struct alloc_t ** get_memory_table() { -#if defined(SMP) && !defined(USE_OPENMP_UNUSED) -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); -# else - int local_memory_table_pos = 
(int)pthread_getspecific(local_storage_key); -# endif /* defined(OS_WINDOWS) */ -# endif /* !defined(HAS_COMPILER_TLS) */ - if (!local_memory_table_pos) { - LOCK_COMMAND(&alloc_lock); - local_memory_table_pos = next_memory_table_pos++; - if (next_memory_table_pos > MAX_ALLOCATING_THREADS) - printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); - UNLOCK_COMMAND(&alloc_lock); -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); -# else - pthread_setspecific(local_storage_key, (void*)local_memory_table_pos); -# endif /* defined(OS_WINDOWS) */ -# endif /* !defined(HAS_COMPILER_TLS) */ +#if defined(SMP) +LOCK_COMMAND(&key_lock); +lsk=local_storage_key; +UNLOCK_COMMAND(&key_lock); + if (!lsk) { + blas_memory_init(); } - return local_memory_table[local_memory_table_pos]; +# if defined(OS_WINDOWS) + struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key); +# else + struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key); +# endif /* defined(OS_WINDOWS) */ #else - return local_memory_table[0]; -#endif /* defined(SMP) && !defined(USE_OPENMP) */ + static struct alloc_t ** local_memory_table = NULL; +#endif /* defined(SMP) */ +#if defined (SMP) +LOCK_COMMAND(&key_lock); +lsk=local_storage_key; +UNLOCK_COMMAND(&key_lock); + if (lsk && !local_memory_table) { +#else + if (!local_memory_table) { +#endif /* defined(SMP) */ + local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS); + memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS); +#if defined(SMP) +# if defined(OS_WINDOWS) +LOCK_COMMAND(&key_lock); + TlsSetValue(local_storage_key, (void*)local_memory_table); +UNLOCK_COMMAND(&key_lock); +# else +LOCK_COMMAND(&key_lock); + pthread_setspecific(local_storage_key, (void*)local_memory_table); +UNLOCK_COMMAND(&key_lock); +# endif /* defined(OS_WINDOWS) 
*/ +#endif /* defined(SMP) */ + } + return local_memory_table; } #ifdef ALLOC_MMAP @@ -637,7 +625,7 @@ static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { static void *alloc_mmap(void *address){ void *map_address, *best_address; - BLASULONG best, start, current; + BLASULONG best, start, current, original; BLASULONG allocsize; if (address){ @@ -685,8 +673,9 @@ static void *alloc_mmap(void *address){ start = (BLASULONG)map_address; current = (SCALING - 1) * allocation_block_size; + original = current; - while(current > 0) { + while(current > 0 && current <= original) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; start += PAGESIZE; current -= PAGESIZE; @@ -1056,18 +1045,29 @@ static volatile int memory_initialized = 0; /* 1 : Level 2 functions */ /* 2 : Thread */ + static void blas_memory_cleanup(void* ptr){ + if (ptr) { + struct alloc_t ** table = (struct alloc_t **)ptr; + int pos; + for (pos = 0; pos < NUM_BUFFERS; pos ++){ + struct alloc_t *alloc_info = table[pos]; + if (alloc_info) { + alloc_info->release_func(alloc_info); + table[pos] = (void *)0; + } + } + free(table); + } +} + static void blas_memory_init(){ -#if defined(SMP) && !defined(USE_OPENMP_UNUSED) - next_memory_table_pos = 0; -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - local_storage_key = ::TlsAlloc(); -# else - pthread_key_create(&local_storage_key, NULL); -# endif /* defined(OS_WINDOWS) */ -# endif /* defined(HAS_COMPILER_TLS) */ -#endif /* defined(SMP) && !defined(USE_OPENMP) */ - memset(local_memory_table, 0, sizeof(local_memory_table)); +#if defined(SMP) +# if defined(OS_WINDOWS) + local_storage_key = TlsAlloc(); +# else + pthread_key_create(&local_storage_key, blas_memory_cleanup); +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ } void *blas_memory_alloc(int procpos){ @@ -1105,7 +1105,16 @@ void *blas_memory_alloc(int procpos){ struct alloc_t * alloc_info; struct alloc_t ** alloc_table; + +#if defined(SMP) && !defined(USE_OPENMP) +int mi; 
+LOCK_COMMAND(&alloc_lock); +mi=memory_initialized; +UNLOCK_COMMAND(&alloc_lock); + if (!LIKELY_ONE(mi)) { +#else if (!LIKELY_ONE(memory_initialized)) { +#endif #if defined(SMP) && !defined(USE_OPENMP) /* Only allow a single thread to initialize memory system */ LOCK_COMMAND(&alloc_lock); @@ -1149,7 +1158,7 @@ void *blas_memory_alloc(int procpos){ if (!alloc_table[position] || !alloc_table[position]->used) goto allocation; position ++; - } while (position < BUFFERS_PER_THREAD); + } while (position < NUM_BUFFERS); goto error; @@ -1247,7 +1256,7 @@ void blas_memory_free(void *buffer){ #ifdef DEBUG alloc_table = get_memory_table(); - for (position = 0; position < BUFFERS_PER_THREAD; position++){ + for (position = 0; position < NUM_BUFFERS; position++){ if (alloc_table[position]) { printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used); } @@ -1267,22 +1276,15 @@ void blas_memory_free_nolock(void * map_address) { } void blas_shutdown(void){ - - int pos, thread; - #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif - for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){ - for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ - struct alloc_t *alloc_info = local_memory_table[thread][pos]; - if (alloc_info) { - alloc_info->release_func(alloc_info); - local_memory_table[thread][pos] = (void *)0; - } - } - } +#ifdef SMP + /* Only cleanupIf we were built for threading and TLS was initialized */ + if (local_storage_key) +#endif + blas_memory_cleanup((void*)get_memory_table()); #ifdef SEEK_ADDRESS base_address = 0UL; @@ -1492,6 +1494,1500 @@ void DESTRUCTOR gotoblas_quit(void) { #endif } +#if defined(_MSC_VER) && !defined(__clang__) +BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) +{ + switch (ul_reason_for_call) + { + case DLL_PROCESS_ATTACH: + gotoblas_init(); + break; + case DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: +#if defined(SMP) + blas_memory_cleanup((void*)get_memory_table()); +#endif + 
break; + case DLL_PROCESS_DETACH: + gotoblas_quit(); + break; + default: + break; + } + return TRUE; +} + +/* + This is to allow static linking. + Code adapted from Google performance tools: + https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc + Reference: + https://sourceware.org/ml/pthreads-win32/2008/msg00028.html + http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp +*/ +static int on_process_term(void) +{ + gotoblas_quit(); + return 0; +} +#ifdef _WIN64 +#pragma comment(linker, "/INCLUDE:_tls_used") +#else +#pragma comment(linker, "/INCLUDE:__tls_used") +#endif + +#ifdef _WIN64 +#pragma const_seg(".CRT$XLB") +#else +#pragma data_seg(".CRT$XLB") +#endif +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; +#ifdef _WIN64 +#pragma const_seg() +#else +#pragma data_seg() +#endif + +#ifdef _WIN64 +#pragma const_seg(".CRT$XTU") +#else +#pragma data_seg(".CRT$XTU") +#endif +static int(*p_process_term)(void) = on_process_term; +#ifdef _WIN64 +#pragma const_seg() +#else +#pragma data_seg() +#endif +#endif + +#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) +/* Don't call me; this is just work around for PGI / Sun bug */ +void gotoblas_dummy_for_PGI(void) { + + gotoblas_init(); + gotoblas_quit(); + +#if 0 + asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); + asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); +#else + asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); + asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); +#endif +} +#endif + +#else +#include + +#ifdef OS_WINDOWS +#define ALLOC_WINDOWS +#ifndef MEM_LARGE_PAGES +#define MEM_LARGE_PAGES 0x20000000 +#endif +#else +#define ALLOC_MMAP +#define ALLOC_MALLOC +#endif + +#include +#include +#include + +#ifndef OS_WINDOWS +#include 
+#ifndef NO_SYSV_IPC +#include +#endif +#include +#endif + +#include + +#ifdef OS_LINUX +#include +#include +#include +#include +#include +#include +#include +#endif + +#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#include +#include +#endif + +#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) +#include +#undef printf +#define printf _cprintf +#endif + +#ifdef OS_LINUX + +#ifndef MPOL_PREFERRED +#define MPOL_PREFERRED 1 +#endif + +#endif + +#if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP) +#define NO_WARMUP +#endif + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + +#ifndef FIXED_PAGESIZE +#define FIXED_PAGESIZE 4096 +#endif + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#if defined(_MSC_VER) && !defined(__clang__) +#define CONSTRUCTOR __cdecl +#define DESTRUCTOR __cdecl +#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) +#else +#define CONSTRUCTOR __attribute__ ((constructor(101))) +#define DESTRUCTOR __attribute__ ((destructor(101))) +#endif + +#ifdef DYNAMIC_ARCH +gotoblas_t *gotoblas = NULL; +#endif +extern void openblas_warning(int verbose, const char * msg); + +#ifndef SMP + +#define blas_cpu_number 1 +#define blas_num_threads 1 + +/* Dummy Function */ +int goto_get_num_procs (void) { return 1;}; +void goto_set_num_threads(int num_threads) {}; + +#else + +#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD) +#ifndef NO_AFFINITY +int get_num_procs(void); +#else +int get_num_procs(void) { + static int nums = 0; +cpu_set_t *cpusetp; +size_t size; +int ret; +int i,n; + + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); +#if !defined(OS_LINUX) + return nums; +#endif + +#if !defined(__GLIBC_PREREQ) + return nums; +#else + #if !__GLIBC_PREREQ(2, 3) + return nums; + #endif + + #if !__GLIBC_PREREQ(2, 7) + ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + if 
(ret!=0) return nums; + n=0; + #if !__GLIBC_PREREQ(2, 6) + for (i=0;i 0) blas_num_threads = blas_goto_num; + else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; + else blas_num_threads = MAX_CPU_NUMBER; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) + if (blas_num_threads > max_num) blas_num_threads = max_num; +#endif + + if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER; + +#ifdef DEBUG + printf( "Adjusted number of threads : %3d\n", blas_num_threads); +#endif + + blas_cpu_number = blas_num_threads; + + return blas_num_threads; +} +#endif + + +int openblas_get_num_procs(void) { +#ifndef SMP + return 1; +#else + return get_num_procs(); +#endif +} + +int openblas_get_num_threads(void) { +#ifndef SMP + return 1; +#else + // init blas_cpu_number if needed + blas_get_cpu_number(); + return blas_cpu_number; +#endif +} + +struct release_t { + void *address; + void (*func)(struct release_t *); + long attr; +}; + +int hugetlb_allocated = 0; + +static struct release_t release_info[NUM_BUFFERS]; +static int release_pos = 0; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) +static int hot_alloc = 0; +#endif + +/* Global lock for memory allocation */ + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t alloc_lock = 0; +#else +static BLASULONG alloc_lock = 0UL; +#endif + +#ifdef ALLOC_MMAP + +static void alloc_mmap_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("OpenBLAS : munmap failed\n"); + } +} + + + +#ifdef NO_WARMUP + +static void *alloc_mmap(void *address){ + void *map_address; + + if (address){ + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + } else { + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY, -1, 0); + } + + if (map_address != (void *)-1) { + 
LOCK_COMMAND(&alloc_lock);
+    release_info[release_pos].address = map_address;
+    release_info[release_pos].func    = alloc_mmap_free;
+    release_pos ++;
+    UNLOCK_COMMAND(&alloc_lock);
+  }
+
+#ifdef OS_LINUX
+  my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
+#endif
+
+  return map_address;
+}
+
+#else
+
+/* Number of timed passes run_bench() makes over one candidate window. */
+#define BENCH_ITERATION 4
+/* alloc_mmap() maps SCALING * BUFFER_SIZE bytes and keeps one BUFFER_SIZE window of it. */
+#define SCALING		2
+
+/*
+ * Time a pointer chase through the pages of [address, address + size).
+ *
+ * The first word of the last page is temporarily redirected to `address`,
+ * then size / PAGESIZE links are followed between two rpcc() readings.
+ * This is repeated BENCH_ITERATION times and the smallest elapsed count is
+ * returned.  The links themselves (first word of each page holding the
+ * address of the next page) are written by alloc_mmap() before the first call.
+ * On exit the redirected word is restored and the final chased pointer is
+ * stored at byte offset 8 of the last page (presumably so the chase has an
+ * observable result -- TODO confirm).
+ */
+static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
+
+  BLASULONG original, *p;
+  BLASULONG start, stop, min;
+  int iter, i, count;
+
+  min = (BLASULONG)-1;
+
+  original = *(BLASULONG *)(address + size - PAGESIZE);
+
+  *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
+
+  for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
+
+    p = (BLASULONG *)address;
+
+    count = size / PAGESIZE;
+
+    start = rpcc();
+
+    for (i = 0; i < count; i ++) {
+      p = (BLASULONG *)(*p);
+    }
+
+    stop = rpcc();
+
+    if (min > stop - start) min = stop - start;
+  }
+
+  *(BLASULONG *)(address + size - PAGESIZE +  0) = original;
+  *(BLASULONG *)(address + size - PAGESIZE +  8) = (BLASULONG)p;
+
+  return min;
+}
+
+/*
+ * mmap-based buffer allocator (variant built when NO_WARMUP is not defined).
+ *
+ *  - address != NULL: map exactly BUFFER_SIZE bytes at that address (MAP_FIXED).
+ *  - address == NULL, Linux warm-up build with hot_alloc == 0: plain
+ *    BUFFER_SIZE mapping at a kernel-chosen address.
+ *  - otherwise: map SCALING * BUFFER_SIZE bytes, link the pages into a chain,
+ *    benchmark every page-aligned start offset with run_bench(), keep the
+ *    BUFFER_SIZE window that starts at the fastest offset and unmap the rest.
+ *
+ * A successful mapping is appended to release_info[] under alloc_lock.
+ * Returns the mapping, or (void *)-1 when mmap failed.
+ */
+static void *alloc_mmap(void *address){
+  void *map_address, *best_address;
+  BLASULONG best, start, current;
+  BLASULONG allocsize;
+
+  if (address){
+    /* A fixed address was requested: skip the benchmarked placement below. */
+    map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
+
+#ifdef OS_LINUX
+    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
+#endif
+
+  } else {
+#if defined(OS_LINUX) && !defined(NO_WARMUP)
+    if (hot_alloc == 0) {
+      map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
+
+#ifdef OS_LINUX
+      my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
+#endif
+
+    } else {
+#endif
+
+      map_address = mmap(NULL, BUFFER_SIZE * SCALING,
+			 MMAP_ACCESS, MMAP_POLICY, -1, 0);
+
+      if (map_address != (void *)-1) {
+
+#ifdef OS_LINUX
+#ifdef DEBUG
+		  int ret=0;
+		  ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
+		  if(ret==-1){
+			  int errsv=errno;
+			  perror("OpenBLAS alloc_mmap:");
+			  printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
+		  }
+
+#else
+		  my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
+#endif
+#endif
+
+
+	/* Size of the window that run_bench() walks for each candidate start. */
+	allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
+
+	start   = (BLASULONG)map_address;
+	current = (SCALING - 1) * BUFFER_SIZE;
+
+	/* Build the chain: the first word of every page points at the next page. */
+	while(current > 0) {
+	  *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
+	  start += PAGESIZE;
+	  current -= PAGESIZE;
+	}
+
+	/* Close the chain: the last linked page points back at the mapping start. */
+	*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
+
+	start = (BLASULONG)map_address;
+
+	best = (BLASULONG)-1;
+	best_address = map_address;
+
+	/* Try every page-aligned start and remember the one with the lowest time. */
+	while ((start + allocsize  < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
+
+	  current = run_bench(start, allocsize);
+
+	  if (best > current) {
+	    best = current;
+	    best_address = (void *)start;
+	  }
+
+	  start += PAGESIZE;
+
+	}
+
+	/* Give back the part of the oversized mapping in front of the chosen window ... */
+      if ((BLASULONG)best_address > (BLASULONG)map_address)
+	munmap(map_address,  (BLASULONG)best_address - (BLASULONG)map_address);
+
+	/* ... and the part behind it, leaving exactly BUFFER_SIZE bytes mapped. */
+      munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
+
+      map_address = best_address;
+
+#if defined(OS_LINUX) && !defined(NO_WARMUP)
+      hot_alloc = 2;
+#endif
+    }
+  }
+#if defined(OS_LINUX) && !defined(NO_WARMUP)
+  }
+#endif
+  LOCK_COMMAND(&alloc_lock);
+
+  /* Remember the mapping so blas_shutdown() can unmap it. */
+  if (map_address != (void *)-1) {
+    release_info[release_pos].address = map_address;
+    release_info[release_pos].func    = alloc_mmap_free;
+    release_pos ++;
+  }
+  UNLOCK_COMMAND(&alloc_lock);
+
+  return map_address;
+}
+
+#endif
+
+#endif
+
+
+#ifdef ALLOC_MALLOC
+
+/* Release callback for buffers obtained by alloc_malloc(). */
+static void alloc_malloc_free(struct release_t *release){
+
+  free(release -> address);
+
+}
+
+/*
+ * malloc-based allocator; the `address` hint is not used.
+ * Returns the buffer, or (void *)-1 when malloc failed.
+ */
+static void *alloc_malloc(void *address){
+
+  void *map_address;
+
+  map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
+
+  if (map_address == (void *)NULL) map_address = (void *)-1;
+
+  if (map_address != (void *)-1) {
+    release_info[release_pos].address = map_address;
+
release_info[release_pos].func    = alloc_malloc_free;
+    release_pos ++;
+  }
+
+  return map_address;
+
+}
+
+#endif
+
+#ifdef ALLOC_QALLOC
+
+void *qalloc(int flags, size_t bytes);
+void *qfree (void *address);
+
+#define QNONCACHE 0x1
+#define QCOMMS    0x2
+#define QFAST     0x4
+
+/* Release callback for buffers obtained by alloc_qalloc(). */
+static void alloc_qalloc_free(struct release_t *release){
+
+  qfree(release -> address);
+
+}
+
+/*
+ * qalloc-based allocator; the `address` hint is not used.
+ * Returns the buffer rounded up to FIXED_PAGESIZE, or (void *)-1 on failure.
+ */
+static void *alloc_qalloc(void *address){
+  void *map_address;
+
+  map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
+
+  /* Return the failure sentinel untouched: rounding (void *)-1 up to a page
+     boundary wraps to NULL, which the caller would take for a valid buffer. */
+  if (map_address == (void *)NULL) return (void *)-1;
+
+  /* The unaligned pointer is what has to be handed back to qfree(). */
+  release_info[release_pos].address = map_address;
+  release_info[release_pos].func    = alloc_qalloc_free;
+  release_pos ++;
+
+  return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
+}
+
+#endif
+
+#ifdef ALLOC_WINDOWS
+
+/* Release callback for buffers obtained by alloc_windows(). */
+static void alloc_windows_free(struct release_t *release){
+
+  VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
+
+}
+
+/*
+ * VirtualAlloc-based allocator: reserves and commits BUFFER_SIZE bytes,
+ * at `address` when one is given.  Returns the region or (void *)-1.
+ */
+static void *alloc_windows(void *address){
+  void *map_address;
+
+  map_address  = VirtualAlloc(address,
+			      BUFFER_SIZE,
+			      MEM_RESERVE | MEM_COMMIT,
+			      PAGE_READWRITE);
+
+  if (map_address == (void *)NULL) map_address = (void *)-1;
+
+  if (map_address != (void *)-1) {
+    release_info[release_pos].address = map_address;
+    release_info[release_pos].func    = alloc_windows_free;
+    release_pos ++;
+  }
+
+  return map_address;
+}
+
+#endif
+
+#ifdef ALLOC_DEVICEDRIVER
+#ifndef DEVICEDRIVER_NAME
+#define DEVICEDRIVER_NAME "/dev/mapper"
+#endif
+
+/* Release callback for alloc_devicedirver(): unmap the region, then close the device fd kept in attr. */
+static void alloc_devicedirver_free(struct release_t *release){
+
+  if (munmap(release -> address, BUFFER_SIZE)) {
+    printf("OpenBLAS : Bugphysarea unmap failed.\n");
+  }
+
+  if (close(release -> attr)) {
+    printf("OpenBLAS : Bugphysarea close failed.\n");
+  }
+
+}
+
+/*
+ * Map BUFFER_SIZE bytes of DEVICEDRIVER_NAME as shared memory.
+ * Returns the mapping, or (void *)-1 when the device cannot be opened or mapped.
+ */
+static void *alloc_devicedirver(void *address){
+
+  int fd;
+  void *map_address;
+
+  if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
+
+    return (void *)-1;
+
+  }
+
+  map_address = mmap(address, BUFFER_SIZE,
+		     PROT_READ | PROT_WRITE,
+		     MAP_FILE | MAP_SHARED,
+		     fd, 0);
+
+  if (map_address != (void *)-1) {
+    release_info[release_pos].address = map_address;
+    release_info[release_pos].attr    = fd;
+    release_info[release_pos].func    = alloc_devicedirver_free;
+    release_pos ++;
+  } else {
+    /* Nothing will ever call the release callback for a failed mapping,
+       so the descriptor has to be closed here or it leaks. */
+    close(fd);
+  }
+
+  return map_address;
+}
+
+#endif
+
+#ifdef ALLOC_SHM
+
+/* Release callback for alloc_shm(): detach the segment. */
+static void alloc_shm_free(struct release_t *release){
+
+  if (shmdt(release -> address)) {
+    printf("OpenBLAS : Shared memory unmap failed.\n");
+  }
+}
+
+/*
+ * SysV shared-memory allocator: creates a private BUFFER_SIZE segment and
+ * attaches it, at `address` when one is given.
+ * Returns the attached address or (void *)-1.
+ */
+static void *alloc_shm(void *address){
+  void *map_address;
+  int shmid;
+
+  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
+
+  if (shmid == -1) return (void *)-1;
+
+  map_address = (void *)shmat(shmid, address, 0);
+
+  /* Mark the segment for removal whether or not the attach worked; a segment
+     that is never attached would otherwise stay allocated after we exit. */
+  shmctl(shmid, IPC_RMID, 0);
+
+  if (map_address != (void *)-1){
+
+#ifdef OS_LINUX
+    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
+#endif
+
+    release_info[release_pos].address = map_address;
+    release_info[release_pos].attr    = shmid;
+    release_info[release_pos].func    = alloc_shm_free;
+    release_pos ++;
+  }
+
+  return map_address;
+}
+
+#if defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS
+
+/* Release callback for alloc_hugetlb(); undoes whichever platform path made the allocation. */
+static void alloc_hugetlb_free(struct release_t *release){
+
+#if defined(OS_LINUX) || defined(OS_AIX)
+  if (shmdt(release -> address)) {
+    printf("OpenBLAS : Hugepage unmap failed.\n");
+  }
+#endif
+
+#ifdef __sun__
+
+  /* NOTE(review): the matching allocation below uses memalign(), not mmap() -- confirm munmap() is intended. */
+  munmap(release -> address, BUFFER_SIZE);
+
+#endif
+
+#ifdef OS_WINDOWS
+
+  VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
+
+#endif
+
+}
+
+/*
+ * Huge/large-page allocator.
+ *   Linux / AIX : SysV segment created with SHM_HUGETLB (Linux) or
+ *                 SHM_LGPAGE | SHM_PIN (AIX), attached with SHM_RND.
+ *   Solaris     : memcntl() large-page advice followed by memalign().
+ *   Windows     : VirtualAlloc(MEM_LARGE_PAGES) with the lock-memory
+ *                 privilege enabled for the duration of the call.
+ * Returns the buffer or (void *)-1.
+ */
+static void *alloc_hugetlb(void *address){
+
+  void *map_address = (void *)-1;
+
+#if defined(OS_LINUX) || defined(OS_AIX)
+  int shmid;
+
+  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
+#ifdef OS_LINUX
+		 SHM_HUGETLB |
+#endif
+#ifdef OS_AIX
+		 SHM_LGPAGE | SHM_PIN |
+#endif
+		 IPC_CREAT | SHM_R | SHM_W);
+
+  if (shmid != -1) {
+    map_address = (void *)shmat(shmid, address, SHM_RND);
+
+    /* Always mark the segment for removal so a failed attach cannot leave
+       huge pages reserved after the process is gone. */
+    shmctl(shmid, IPC_RMID, 0);
+
+#ifdef OS_LINUX
+    if (map_address != (void *)-1){
+      my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
+    }
+#endif
+  }
+#endif
+
+#ifdef __sun__
+  struct memcntl_mha mha;
+
+  mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
+  mha.mha_flags = 0;
+  mha.mha_pagesize = HUGE_PAGESIZE;
+  memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
+
+  map_address = (void *)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
+
+  /* memalign() reports failure with NULL; translate to this file's sentinel. */
+  if (map_address == (void *)NULL) map_address = (void *)-1;
+#endif
+
+#ifdef OS_WINDOWS
+
+  HANDLE hToken;
+  TOKEN_PRIVILEGES tp;
+
+  if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
+
+  tp.PrivilegeCount = 1;
+  tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+
+  if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
+      CloseHandle(hToken);
+      return (void*)-1;
+  }
+
+  if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
+      CloseHandle(hToken);
+      return (void*)-1;
+  }
+
+  map_address  = (void *)VirtualAlloc(address,
+				      BUFFER_SIZE,
+				      MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
+				      PAGE_READWRITE);
+
+  /* Drop the privilege again, then release the token handle; the error
+     paths above already close it, the success path used to leak it. */
+  tp.Privileges[0].Attributes = 0;
+  AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
+  CloseHandle(hToken);
+
+  if (map_address == (void *)NULL) map_address = (void *)-1;
+
+#endif
+
+  if (map_address != (void *)-1){
+    release_info[release_pos].address = map_address;
+    release_info[release_pos].func    = alloc_hugetlb_free;
+    release_pos ++;
+  }
+
+  return map_address;
+}
+#endif
+
+#endif
+
+#ifdef ALLOC_HUGETLBFILE
+
+static int hugetlb_pid = 0;
+
+/* Release callback for alloc_hugetlbfile(): unmap the region, then close the backing fd kept in attr. */
+static void alloc_hugetlbfile_free(struct release_t *release){
+
+  if (munmap(release -> address, BUFFER_SIZE)) {
+    printf("OpenBLAS : HugeTLBfs unmap failed.\n");
+  }
+
+  if (close(release -> attr)) {
+    printf("OpenBLAS : HugeTLBfs close failed.\n");
+  }
+}
+
+/*
+ * Allocator backed by a file created under HUGETLB_FILE_NAME; the file is
+ * unlinked right after it is opened.  Returns the mapping or (void *)-1.
+ */
+static void *alloc_hugetlbfile(void *address){
+
+  void *map_address = (void *)-1;
+  int fd;
+  char filename[64];
+
+  if (!hugetlb_pid) hugetlb_pid = getpid();
+
+  /* Bounded write: HUGETLB_FILE_NAME is a build-time path of unknown length. */
+  snprintf(filename, sizeof(filename), "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
+
+  if ((fd = open(filename,
O_RDWR | O_CREAT, 0700)) < 0) {
+    return (void *)-1;
+  }
+
+  unlink(filename);
+
+  map_address = mmap(address, BUFFER_SIZE,
+		     PROT_READ | PROT_WRITE,
+		     MAP_SHARED,
+		     fd, 0);
+
+  if (map_address != (void *)-1) {
+    release_info[release_pos].address = map_address;
+    release_info[release_pos].attr    = fd;
+    release_info[release_pos].func    = alloc_hugetlbfile_free;
+    release_pos ++;
+  } else {
+    /* No release callback will run for a failed mapping: close the fd here. */
+    close(fd);
+  }
+
+  return map_address;
+}
+#endif
+
+
+/* Address hint handed to the allocators; reset to 0 once a hinted attempt has failed. */
+#ifdef SEEK_ADDRESS
+static BLASULONG base_address      = 0UL;
+#else
+static BLASULONG base_address      = BASE_ADDRESS;
+#endif
+
+/* Buffer slot table: one entry per buffer that can be handed out at the same time. */
+static volatile struct {
+  BLASULONG lock;
+  void *addr;
+#if defined(WHEREAMI) && !defined(USE_OPENMP)
+  int   pos;
+#endif
+  int used;
+#ifndef __64BIT__
+  char dummy[48];
+#else
+  char dummy[40];
+#endif
+
+} memory[NUM_BUFFERS];
+
+static int memory_initialized = 0;
+
+/*       Memory allocation routine           */
+/* procpos ... indicates where it comes from */
+/*                0 : Level 3 functions      */
+/*                1 : Level 2 functions      */
+/*                2 : Thread                 */
+/*                                           */
+/* Claims a free slot of memory[] and, when  */
+/* the slot has no buffer yet, obtains one   */
+/* from the first allocator that succeeds.   */
+/* Returns the buffer address, or NULL when  */
+/* all slots are in use or every allocator   */
+/* failed.                                   */
+
+void *blas_memory_alloc(int procpos){
+
+  int position;
+#if defined(WHEREAMI) && !defined(USE_OPENMP)
+  int mypos;
+#endif
+  /* Set once the allocators have been retried without an address hint. */
+  int retried = 0;
+
+  void *map_address;
+
+  /* Allocators in order of preference, terminated by a NULL sentinel. */
+  void *(*memoryalloc[])(void *address) = {
+#ifdef ALLOC_DEVICEDRIVER
+    alloc_devicedirver,
+#endif
+/* Hugetlb implicitly assumes ALLOC_SHM */
+#ifdef ALLOC_SHM
+    alloc_shm,
+#endif
+#if ((defined ALLOC_SHM) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
+    alloc_hugetlb,
+#endif
+#ifdef ALLOC_MMAP
+    alloc_mmap,
+#endif
+#ifdef ALLOC_QALLOC
+    alloc_qalloc,
+#endif
+#ifdef ALLOC_WINDOWS
+    alloc_windows,
+#endif
+#ifdef ALLOC_MALLOC
+    alloc_malloc,
+#endif
+    NULL,
+  };
+  void *(**func)(void *address);
+  LOCK_COMMAND(&alloc_lock);
+
+  /* One-time library setup, done by whichever caller gets here first. */
+  if (!memory_initialized) {
+
+#if defined(WHEREAMI) && !defined(USE_OPENMP)
+    for (position = 0; position < NUM_BUFFERS; position ++){
+      memory[position].addr   = (void *)0;
+      memory[position].pos    = -1;
+      memory[position].used   = 0;
+      memory[position].lock   = 0;
+    }
+#endif
+
+#ifdef DYNAMIC_ARCH
+    gotoblas_dynamic_init();
+#endif
+
+#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
+    gotoblas_affinity_init();
+#endif
+
+#ifdef SMP
+    if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
+#endif
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
+#ifndef DYNAMIC_ARCH
+    blas_set_parameter();
+#endif
+#endif
+
+    memory_initialized = 1;
+
+  }
+  UNLOCK_COMMAND(&alloc_lock);
+
+#ifdef DEBUG
+  printf("Alloc Start ...\n");
+#endif
+
+#if defined(WHEREAMI) && !defined(USE_OPENMP)
+
+  /* First look for a free slot that was last used from the same WhereAmI() position. */
+  mypos = WhereAmI();
+
+  position = mypos;
+  while (position >= NUM_BUFFERS) position >>= 1;
+
+  do {
+    if (!memory[position].used && (memory[position].pos == mypos)) {
+      LOCK_COMMAND(&alloc_lock);
+/*      blas_lock(&memory[position].lock);*/
+
+      if (!memory[position].used) goto allocation;
+
+      UNLOCK_COMMAND(&alloc_lock);
+/*      blas_unlock(&memory[position].lock);*/
+    }
+
+    position ++;
+
+  } while (position < NUM_BUFFERS);
+
+
+#endif
+
+  position = 0;
+
+  /* Take the first free slot; the jump to `allocation` is made with alloc_lock held. */
+  do {
+/*    if (!memory[position].used) { */
+      LOCK_COMMAND(&alloc_lock);
+/*      blas_lock(&memory[position].lock);*/
+
+      if (!memory[position].used) goto allocation;
+
+      UNLOCK_COMMAND(&alloc_lock);
+/*      blas_unlock(&memory[position].lock);*/
+/*    } */
+
+    position ++;
+
+  } while (position < NUM_BUFFERS);
+
+  goto error;
+
+  allocation :
+
+#ifdef DEBUG
+  printf("  Position -> %d\n", position);
+#endif
+
+  memory[position].used = 1;
+
+  UNLOCK_COMMAND(&alloc_lock);
+/*  blas_unlock(&memory[position].lock);*/
+
+  if (!memory[position].addr) {
+    do {
+#ifdef DEBUG
+      printf("Allocation Start : %lx\n", base_address);
+#endif
+
+      map_address = (void *)-1;
+
+      func = &memoryalloc[0];
+
+      /* Stop at the NULL sentinel.  `func` itself is never NULL, so the
+         entry it points at has to be tested; otherwise a total failure ends
+         in a call through the sentinel. */
+      while ((*func != NULL) && (map_address == (void *) -1)) {
+
+	map_address = (*func)((void *)base_address);
+
+#ifdef ALLOC_DEVICEDRIVER
+	if ((*func ==  alloc_devicedirver) && (map_address == (void *)-1)) {
+	    fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
+	}
+#endif
+
+#ifdef ALLOC_HUGETLBFILE
+	if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
+#ifndef OS_WINDOWS
+	    fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
+#endif
+	}
+#endif
+
+#if (defined ALLOC_SHM) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
+	if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
+#endif
+
+	func ++;
+      }
+
+#ifdef DEBUG
+      printf("  Success -> %08lx\n", map_address);
+#endif
+      if (((BLASLONG) map_address) == -1) {
+	if (retried) {
+	  /* Every allocator has also failed without an address hint: hand the
+	     slot back and report instead of retrying forever. */
+	  LOCK_COMMAND(&alloc_lock);
+	  memory[position].used = 0;
+	  UNLOCK_COMMAND(&alloc_lock);
+	  fprintf(stderr, "OpenBLAS : memory allocation failed, no allocator could provide a buffer.\n");
+	  return NULL;
+	}
+	/* Drop the address hint and walk the allocator list once more. */
+	base_address = 0UL;
+	retried = 1;
+      }
+
+      if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
+
+    } while ((BLASLONG)map_address == -1);
+
+    LOCK_COMMAND(&alloc_lock);
+    memory[position].addr = map_address;
+    UNLOCK_COMMAND(&alloc_lock);
+
+#ifdef DEBUG
+    printf("  Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
+#endif
+  }
+
+#if defined(WHEREAMI) && !defined(USE_OPENMP)
+
+  if (memory[position].pos == -1) memory[position].pos = mypos;
+
+#endif
+
+#ifdef DYNAMIC_ARCH
+
+  /* Second initialization stage, checked again under the lock so it runs once. */
+  if (memory_initialized == 1) {
+
+    LOCK_COMMAND(&alloc_lock);
+
+    if (memory_initialized == 1) {
+
+      if (!gotoblas) gotoblas_dynamic_init();
+
+      memory_initialized = 2;
+    }
+
+    UNLOCK_COMMAND(&alloc_lock);
+
+  }
+#endif
+
+
+#ifdef DEBUG
+  printf("Mapped   : %p  %3d\n\n",
+	  (void *)memory[position].addr, position);
+#endif
+
+  return (void *)memory[position].addr;
+
+ error:
+  printf("BLAS : Program is Terminated.
Because you tried to allocate too many memory regions.\n");
+
+  return NULL;
+}
+
+/*
+ * Return a buffer obtained from blas_memory_alloc() to the slot table.
+ * The buffer stays mapped; only its slot is marked free for reuse.
+ * A pointer that is not in the table is reported and otherwise ignored.
+ */
+void blas_memory_free(void *free_area){
+
+  int position;
+
+#ifdef DEBUG
+  printf("Unmapped Start : %p ...\n", free_area);
+#endif
+
+  position = 0;
+  LOCK_COMMAND(&alloc_lock);
+
+  while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
+    position++;
+
+  /* The scan leaves position == NUM_BUFFERS when nothing matched; test the
+     index itself so memory[] is never read one element past its end. */
+  if (position >= NUM_BUFFERS) goto error;
+
+#ifdef DEBUG
+  printf("  Position : %d\n", position);
+#endif
+
+  // arm: ensure all writes are finished before other thread takes this memory
+  WMB;
+
+  memory[position].used = 0;
+  UNLOCK_COMMAND(&alloc_lock);
+
+#ifdef DEBUG
+  printf("Unmap Succeeded.\n\n");
+#endif
+
+  return;
+
+ error:
+  printf("BLAS : Bad memory unallocation! : %4d  %p\n", position,  free_area);
+
+#ifdef DEBUG
+  /* position is an int, so it is printed with %d. */
+  for (position = 0; position < NUM_BUFFERS; position++)
+    printf("%4d  %p : %d\n", position, memory[position].addr, memory[position].used);
+#endif
+  UNLOCK_COMMAND(&alloc_lock);
+
+  return;
+}
+
+/* Plain malloc of one buffer; takes no lock and does not touch the slot table. */
+void *blas_memory_alloc_nolock(int unused) {
+  void *map_address;
+  map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
+  return map_address;
+}
+
+/* Counterpart of blas_memory_alloc_nolock(). */
+void blas_memory_free_nolock(void * map_address) {
+  free(map_address);
+}
+
+/*
+ * Stop the worker threads (SMP builds), run the release callback of every
+ * region recorded in release_info[], and reset the slot table and the
+ * address hint to their initial values.
+ */
+void blas_shutdown(void){
+
+  int pos;
+
+#ifdef SMP
+  BLASFUNC(blas_thread_shutdown)();
+#endif
+
+  LOCK_COMMAND(&alloc_lock);
+
+  for (pos = 0; pos < release_pos; pos ++) {
+    release_info[pos].func(&release_info[pos]);
+  }
+
+  /* Everything recorded so far has been released; start the list over so a
+     later shutdown does not release the same regions a second time. */
+  release_pos = 0;
+
+#ifdef SEEK_ADDRESS
+  base_address      = 0UL;
+#else
+  base_address      = BASE_ADDRESS;
+#endif
+
+  for (pos = 0; pos < NUM_BUFFERS; pos ++){
+    memory[pos].addr   = (void *)0;
+    memory[pos].used   = 0;
+#if defined(WHEREAMI) && !defined(USE_OPENMP)
+    memory[pos].pos    = -1;
+#endif
+    memory[pos].lock   = 0;
+  }
+
+  UNLOCK_COMMAND(&alloc_lock);
+
+  return;
+}
+
+#if defined(OS_LINUX) && !defined(NO_WARMUP)
+
+#ifdef SMP
+#if   defined(USE_PTHREAD_LOCK)
+static pthread_mutex_t    init_lock = PTHREAD_MUTEX_INITIALIZER;
+#elif defined(USE_PTHREAD_SPINLOCK)
+static pthread_spinlock_t init_lock = 0;
+#else
+static BLASULONG   init_lock = 0UL;
+#endif
+#endif
+
+/*
+ * Warm-up routine: write one int per page across the buffer at
+ * sa + GEMM_OFFSET_A, then one int every 64 bytes across the first
+ * MIN(BUFFER_SIZE - PAGESIZE, L2_SIZE) bytes.  Only `sa` is used; the
+ * other parameters exist so the function matches the routine signature
+ * stored in a blas_queue_t.  Compiled to a no-op on POWER and SPARC, and
+ * skipped when hot_alloc == 2.
+ */
+static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
+			  void *sa, void *sb, BLASLONG pos) {
+
+#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
+
+  size_t size;
+  BLASULONG buffer;
+
+  size   = BUFFER_SIZE - PAGESIZE;
+  buffer = (BLASULONG)sa + GEMM_OFFSET_A;
+
+#if defined(OS_LINUX) && !defined(NO_WARMUP)
+    if (hot_alloc != 2) {
+#endif
+
+#ifdef SMP
+  /* The page-touching pass is serialized across threads by init_lock. */
+  LOCK_COMMAND(&init_lock);
+#endif
+
+  while (size > 0) {
+    *(int *)buffer = size;
+    buffer += PAGESIZE;
+    size   -= PAGESIZE;
+  }
+
+#ifdef SMP
+  UNLOCK_COMMAND(&init_lock);
+#endif
+
+  size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
+  buffer = (BLASULONG)sa + GEMM_OFFSET_A;
+
+  while (size > 0) {
+    *(int *)buffer = size;
+    buffer += 64;
+    size   -= 64;
+  }
+
+#if defined(OS_LINUX) && !defined(NO_WARMUP)
+    }
+#endif
+
+#endif
+}
+
+#ifdef SMP
+
+/*
+ * Build a chain of blas_num_threads queue entries that each run
+ * _touch_memory, store `buffer` as the sa of the first entry, and hand
+ * the chain to exec_blas().
+ */
+static void _init_thread_memory(void *buffer) {
+
+  blas_queue_t queue[MAX_CPU_NUMBER];
+  int num_cpu;
+
+  for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
+
+    blas_queue_init(&queue[num_cpu]);
+    queue[num_cpu].mode    = BLAS_DOUBLE | BLAS_REAL;
+    queue[num_cpu].routine = &_touch_memory;
+    queue[num_cpu].args    = NULL;
+    queue[num_cpu].next    = &queue[num_cpu + 1];
+  }
+
+  /* Terminate the chain at the last entry that was filled in. */
+  queue[num_cpu - 1].next = NULL;
+  queue[0].sa = buffer;
+
+  exec_blas(num_cpu, queue);
+
+}
+#endif
+
+/*
+ * Allocate one buffer with hot_alloc set to 1, touch it (through the
+ * thread queue in SMP builds, directly otherwise) and give it back.
+ */
+static void gotoblas_memory_init(void) {
+
+  void *buffer;
+
+  hot_alloc = 1;
+
+  buffer = (void *)blas_memory_alloc(0);
+
+#ifdef SMP
+  if (blas_cpu_number == 0) blas_get_cpu_number();
+#ifdef SMP_SERVER
+  if (blas_server_avail == 0) blas_thread_init();
+#endif
+
+  _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
+
+#else
+
+  _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
+
+#endif
+
+  blas_memory_free(buffer);
+}
+#endif
+
+/* Initialization for all function; this function should be called before main */
+
+/* Non-zero between a completed gotoblas_init() and the next gotoblas_quit(). */
+static int gotoblas_initialized = 0;
+extern void openblas_read_env();
+
+/*
+ * Library constructor.  Returns at once when already initialized; otherwise
+ * installs the fork handler (SMP), reads the environment settings, runs the
+ * build-dependent init steps below in order, and sets gotoblas_initialized.
+ */
+void CONSTRUCTOR gotoblas_init(void) {
+
+  if (gotoblas_initialized) return;
+
+#ifdef SMP
+  openblas_fork_handler();
+#endif
+
+  openblas_read_env();
+
+#ifdef PROFILE
+   moncontrol (0);
+#endif
+
+#ifdef DYNAMIC_ARCH
+   gotoblas_dynamic_init();
+#endif
+
+#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
+   gotoblas_affinity_init();
+#endif
+
+#if defined(OS_LINUX) && !defined(NO_WARMUP)
+   gotoblas_memory_init();
+#endif
+
+//#if defined(OS_LINUX)
+#if 0
+   struct rlimit curlimit;
+   if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
+   {
+	if ( curlimit.rlim_cur != curlimit.rlim_max )
+	{
+		curlimit.rlim_cur = curlimit.rlim_max;
+		setrlimit(RLIMIT_STACK, &curlimit);
+	}
+   }
+#endif
+
+#ifdef SMP
+  if (blas_cpu_number == 0) blas_get_cpu_number();
+#ifdef SMP_SERVER
+  if (blas_server_avail == 0) blas_thread_init();
+#endif
+#endif
+
+#ifdef FUNCTION_PROFILE
+   gotoblas_profile_init();
+#endif
+
+   gotoblas_initialized = 1;
+
+#ifdef PROFILE
+   moncontrol (1);
+#endif
+
+}
+
+/*
+ * Library destructor.  Does nothing unless gotoblas_init() completed;
+ * otherwise calls blas_shutdown(), runs the build-dependent quit steps
+ * below, and clears gotoblas_initialized.
+ */
+void DESTRUCTOR gotoblas_quit(void) {
+
+  if (gotoblas_initialized == 0) return;
+
+  blas_shutdown();
+
+#ifdef PROFILE
+   moncontrol (0);
+#endif
+
+#ifdef FUNCTION_PROFILE
+   gotoblas_profile_quit();
+#endif
+
+#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
+   gotoblas_affinity_quit();
+#endif
+
+#ifdef DYNAMIC_ARCH
+   gotoblas_dynamic_quit();
+#endif
+
+   gotoblas_initialized = 0;
+
+#ifdef PROFILE
+   moncontrol (1);
+#endif
+}
+
 #if defined(_MSC_VER) && !defined(__clang__)
 BOOL APIENTRY DllMain(HMODULE hModule, DWORD  ul_reason_for_call, LPVOID lpReserved)
 {
@@ -1573,3 +3069,5 @@ void gotoblas_dummy_for_PGI(void) {
 #endif
 }
 #endif
+
+#endif
diff --git a/exports/Makefile b/exports/Makefile
index 127b05057..29075a9c2 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -122,7 +122,7 @@ endif
 dllinit.$(SUFFIX) : dllinit.c
 	$(CC) $(CFLAGS) -c -o $(@F) -s $<
 
-ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
+ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) so : ../$(LIBSONAME) diff --git a/interface/gbmv.c b/interface/gbmv.c index 096c9f6f2..1d58ba807 100644 --- a/interface/gbmv.c +++ b/interface/gbmv.c @@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans) lenx = m; if (trans) leny = n; - if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/gemv.c b/interface/gemv.c index 30709e361..c9d52cd69 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans) lenx = m; if (trans) leny = n; - if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/rotg.c b/interface/rotg.c index 092554299..69443a5a0 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ long double s; long double r, roe, z; - long double ada = fabs(da); - long double adb = fabs(db); + long double ada = fabsl(da); + long double adb = fabsl(db); long double scale = ada + adb; #ifndef CBLAS diff --git a/interface/sbmv.c b/interface/sbmv.c index 761a9a0d0..25e99ca34 100644 --- a/interface/sbmv.c +++ b/interface/sbmv.c @@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/spmv.c b/interface/spmv.c index 403458b06..e08ae3f6e 100644 --- a/interface/spmv.c +++ b/interface/spmv.c @@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) 
SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/symv.c b/interface/symv.c index e4e300e20..07bd20022 100644 --- a/interface/symv.c +++ b/interface/symv.c @@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, if (n == 0) return; - if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/zgbmv.c b/interface/zgbmv.c index a04be2fbf..5e275a8ed 100644 --- a/interface/zgbmv.c +++ b/interface/zgbmv.c @@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans & 1) lenx = m; if (trans & 1) leny = n; - if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; diff --git a/interface/zgemv.c b/interface/zgemv.c index 0c75564f0..3e98dba7f 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans & 1) lenx = m; if (trans & 1) leny = n; - if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; diff --git a/interface/zhbmv.c b/interface/zhbmv.c index 9ad1b53a1..656f137c6 100644 --- a/interface/zhbmv.c +++ b/interface/zhbmv.c @@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) 
return; diff --git a/interface/zhemv.c b/interface/zhemv.c index 2aee880dc..d1996ad69 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; diff --git a/interface/zhpmv.c b/interface/zhpmv.c index b72a6d670..ff49716b5 100644 --- a/interface/zhpmv.c +++ b/interface/zhpmv.c @@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; diff --git a/interface/zrotg.c b/interface/zrotg.c index 187343d41..8caa411fc 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ long double db_i = *(DB + 1); long double r; - long double ada = fabs(da_r) + fabs(da_i); + long double ada = fabsl(da_r) + fabsl(da_i); PRINT_DEBUG_NAME; diff --git a/interface/zsbmv.c b/interface/zsbmv.c index b71d4c519..cd5cefed9 100644 --- a/interface/zsbmv.c +++ b/interface/zsbmv.c @@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index b37e536ef..9258f216d 100644 --- 
a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8) USE_TRMM = 1 endif -ifeq ($(CORE), Z13) +ifeq ($(ARCH), zarch) USE_TRMM = 1 endif diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index b4acdccd2..cde5bdaa6 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/daxpy_microk_skylakex-2.c b/kernel/x86_64/daxpy_microk_skylakex-2.c new file mode 100644 index 000000000..e785a39f1 --- /dev/null +++ b/kernel/x86_64/daxpy_microk_skylakex-2.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#include <immintrin.h>
+
+#define HAVE_KERNEL_8 1
+/* y[i] += alpha[0] * x[i] for i in [0, n); 32 doubles per step with AVX-512, 16 per step with AVX2. */
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+	BLASLONG i = 0;
+
+	__m256d __alpha;
+
+	__alpha =  _mm256_broadcastsd_pd(_mm_load_sd(alpha));
+
+#ifdef __AVX512CD__
+	BLASLONG n32;
+	__m512d __alpha5;
+	__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
+
+	n32 = n & ~31;
+
+	for (; i < n32; i+= 32) {
+		_mm512_storeu_pd(&y[i +  0], _mm512_loadu_pd(&y[i +  0]) + __alpha5 * _mm512_loadu_pd(&x[i +  0]));
+		_mm512_storeu_pd(&y[i +  8], _mm512_loadu_pd(&y[i +  8]) + __alpha5 * _mm512_loadu_pd(&x[i +  8]));
+		_mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16]));
+		_mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24]));
+	}
+
+#endif
+	/* No scalar remainder below: presumably the caller passes n as a multiple of 16 -- confirm in daxpy.c. */
+	for (; i < n; i+= 16) {
+		_mm256_storeu_pd(&y[i +  0], _mm256_loadu_pd(&y[i +  0]) + __alpha * _mm256_loadu_pd(&x[i +  0]));
+		_mm256_storeu_pd(&y[i +  4], _mm256_loadu_pd(&y[i +  4]) + __alpha * _mm256_loadu_pd(&x[i +  4]));
+		_mm256_storeu_pd(&y[i +  8], _mm256_loadu_pd(&y[i +  8]) + __alpha * _mm256_loadu_pd(&x[i +  8]));
+		_mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12]));
+	}
+}
+#else
+#include "daxpy_microk_haswell-2.c"
+#endif
+
+
diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c
index 0dc9cd3da..969357614 100644
--- a/kernel/x86_64/ddot.c
+++ b/kernel/x86_64/ddot.c
@@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "ddot_microk_piledriver-2.c"
 #elif defined(NEHALEM)
 #include "ddot_microk_nehalem-2.c"
-#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
+#elif defined(HASWELL) || defined(ZEN)
 #include "ddot_microk_haswell-2.c"
+#elif defined (SKYLAKEX)
+#include "ddot_microk_skylakex-2.c"
 #elif defined(SANDYBRIDGE)
 #include "ddot_microk_sandy-2.c"
 #endif
diff --git a/kernel/x86_64/ddot_microk_skylakex-2.c b/kernel/x86_64/ddot_microk_skylakex-2.c
new file mode 100644
index 000000000..8eabf225a
--- /dev/null
+++ b/kernel/x86_64/ddot_microk_skylakex-2.c
@@ -0,0 +1,96 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#define HAVE_KERNEL_8 1
+
+#include <immintrin.h>
+/* *dot = sum of x[i] * y[i] for i in [0, n); index and bound are BLASLONG so a large n is not truncated. */
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+	BLASLONG i = 0;
+	__m256d accum_0, accum_1, accum_2, accum_3;
+
+	accum_0 = _mm256_setzero_pd();
+	accum_1 = _mm256_setzero_pd();
+	accum_2 = _mm256_setzero_pd();
+	accum_3 = _mm256_setzero_pd();
+
+#ifdef __AVX512CD__
+	__m512d accum_05, accum_15, accum_25, accum_35;
+	BLASLONG n32;
+	n32 = n & (~31);
+
+	accum_05 = _mm512_setzero_pd();
+	accum_15 = _mm512_setzero_pd();
+	accum_25 = _mm512_setzero_pd();
+	accum_35 = _mm512_setzero_pd();
+
+	for (; i < n32; i += 32) {
+		accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]);
+		accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]);
+		accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]);
+		accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]);
+	}
+
+	/*
+	 * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
+	 * below can continue using the intermediate results in its loop
+	 */
+	accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1);
+	accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1);
+	accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1);
+	accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1);
+
+#endif
+	for (; i < n; i += 16) {
+		accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]);
+		accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]);
+		accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]);
+		accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]);
+	}
+
+	/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
+
+	accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+
+	__m128d half_accum0;
+
+	/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
+	half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
+
+	/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
+	half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
+
+	*dot = half_accum0[0];
+}
+
+#else
+#include "ddot_microk_haswell-2.c"
+#endif
diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c
index 309fbe767..6d2530e81 100644
--- a/kernel/x86_64/dgemv_n_4.c
+++ b/kernel/x86_64/dgemv_n_4.c
@@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" +#elif defined (SKYLAKEX) +#include "dgemv_n_microk_skylakex-4.c" #endif diff --git a/kernel/x86_64/dgemv_n_microk_skylakex-4.c b/kernel/x86_64/dgemv_n_microk_skylakex-4.c new file mode 100644 index 000000000..4030399ab --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_skylakex-4.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_4x4 1 + +#include + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + int i = 0; + + __m256d x0, x1, x2, x3; + __m256d __alpha; + + x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); + x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); + x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2])); + x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3])); + + __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + +#ifdef __AVX512CD__ + int n5; + __m512d x05, x15, x25, x35; + __m512d __alpha5; + n5 = n & ~7; + + x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0])); + x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1])); + x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2])); + x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3])); + + __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); + + for (; i < n5; i+= 8) { + __m512d tempY; + __m512d sum; + + sum = _mm512_loadu_pd(&ap[0][i]) * x05 + + _mm512_loadu_pd(&ap[1][i]) * x15 + + _mm512_loadu_pd(&ap[2][i]) * x25 + + _mm512_loadu_pd(&ap[3][i]) * x35; + + tempY = _mm512_loadu_pd(&y[i]); + tempY += sum * __alpha5; + _mm512_storeu_pd(&y[i], tempY); + } +#endif + + for (; i < n; i+= 4) { + __m256d tempY; + __m256d sum; + + sum = 
_mm256_loadu_pd(&ap[0][i]) * x0 + + _mm256_loadu_pd(&ap[1][i]) * x1 + + _mm256_loadu_pd(&ap[2][i]) * x2 + + _mm256_loadu_pd(&ap[3][i]) * x3; + + tempY = _mm256_loadu_pd(&y[i]); + tempY += sum * __alpha; + _mm256_storeu_pd(&y[i], tempY); + } + +} + + +#define HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + int i = 0; + + __m256d x0, x1; + __m256d __alpha; + + x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); + x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); + + __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + + + for (i = 0; i < n; i+= 4) { + __m256d tempY; + __m256d sum; + + sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1; + + tempY = _mm256_loadu_pd(&y[i]); + tempY += sum * __alpha; + _mm256_storeu_pd(&y[i], tempY); + } + +} + +#else +#include "dgemv_n_microk_haswell-4.c" +#endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 2c7b3b17c..ef9a0a6ba 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dscal_microk_skylakex-2.c b/kernel/x86_64/dscal_microk_skylakex-2.c new file mode 100644 index 000000000..e0598272e --- /dev/null +++ b/kernel/x86_64/dscal_microk_skylakex-2.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + int i = 0; + +#ifdef __AVX512CD__ + __m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); + for (; i < n; i += 8) { + _mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0])); + } +#else + __m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + for (; i < n; i += 8) { + _mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0])); + _mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4])); + } +#endif +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + int i = 0; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512d zero = _mm512_setzero_pd(); + for (; i < n; i += 8) { + _mm512_storeu_pd(&x[i], zero); + } +#else + __m256d zero = _mm256_setzero_pd(); + for (; i < n; i += 8) { + _mm256_storeu_pd(&x[i + 0], zero); + _mm256_storeu_pd(&x[i + 4], zero); + } +#endif + +} + +#else +#include "dscal_microk_haswell-2.c" +#endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 73099462c..a722cc9df 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_L_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" #elif defined(NEHALEM) diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c new file mode 100644 index 000000000..8244dffa1 --- /dev/null +++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -0,0 +1,161 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_4x4 1 + +static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + + __m256d accum_0, accum_1, accum_2, accum_3; + __m256d temp1_0, temp1_1, temp1_2, temp1_3; + + /* the 256 bit wide acculmulator vectors start out as zero */ + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); + + temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0])); + temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1])); + temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2])); + temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3])); + +#ifdef __AVX512CD__ + __m512d accum_05, accum_15, accum_25, accum_35; + __m512d temp1_05, temp1_15, temp1_25, temp1_35; + BLASLONG to2; + int delta; + + /* the 512 bit wide accumulator vectors start out as zero */ + accum_05 = _mm512_setzero_pd(); + accum_15 = _mm512_setzero_pd(); + accum_25 = _mm512_setzero_pd(); + accum_35 = _mm512_setzero_pd(); + + temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0])); + temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1])); + temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2])); + 
temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3])); + + delta = (to - from) & ~7; + to2 = from + delta; + + + for (; from < to2; from += 8) { + __m512d _x, _y; + __m512d a0, a1, a2, a3; + + _y = _mm512_loadu_pd(&y[from]); + _x = _mm512_loadu_pd(&x[from]); + + a0 = _mm512_loadu_pd(&a[0][from]); + a1 = _mm512_loadu_pd(&a[1][from]); + a2 = _mm512_loadu_pd(&a[2][from]); + a3 = _mm512_loadu_pd(&a[3][from]); + + _y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3; + + accum_05 += _x * a0; + accum_15 += _x * a1; + accum_25 += _x * a2; + accum_35 += _x * a3; + + _mm512_storeu_pd(&y[from], _y); + + }; + + /* + * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code + * below can continue using the intermediate results in its loop + */ + accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1)); + accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1)); + accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1)); + accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1)); + +#endif + + for (; from != to; from += 4) { + __m256d _x, _y; + __m256d a0, a1, a2, a3; + + _y = _mm256_loadu_pd(&y[from]); + _x = _mm256_loadu_pd(&x[from]); + + /* load 4 rows of matrix data */ + a0 = _mm256_loadu_pd(&a[0][from]); + a1 = _mm256_loadu_pd(&a[1][from]); + a2 = _mm256_loadu_pd(&a[2][from]); + a3 = _mm256_loadu_pd(&a[3][from]); + + _y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3; + + accum_0 += _x * a0; + accum_1 += _x * a1; + accum_2 += _x * a2; + accum_3 += _x * a3; + + _mm256_storeu_pd(&y[from], _y); + + }; + + /* + * we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2 + * output array. There is no direct instruction for this in 256 bit space, only in 128 space. 
+ */ + + __m128d half_accum0, half_accum1, half_accum2, half_accum3; + + + /* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */ + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1)); + half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1)); + half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1)); + + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + half_accum1 = _mm_hadd_pd(half_accum1, half_accum1); + half_accum2 = _mm_hadd_pd(half_accum2, half_accum2); + half_accum3 = _mm_hadd_pd(half_accum3, half_accum3); + + /* and store the lowest double value from each of these vectors in the temp2 output */ + temp2[0] += half_accum0[0]; + temp2[1] += half_accum1[0]; + temp2[2] += half_accum2[0]; + temp2[3] += half_accum3[0]; +} +#else +#include "dsymv_L_microk_haswell-2.c" +#endif \ No newline at end of file diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index d89c4070d..e1349da58 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NEHALEM) #include "saxpy_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "saxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) diff --git a/kernel/x86_64/saxpy_microk_skylakex-2.c b/kernel/x86_64/saxpy_microk_skylakex-2.c new file mode 100644 index 000000000..950f10ba2 --- /dev/null +++ b/kernel/x86_64/saxpy_microk_skylakex-2.c @@ -0,0 +1,69 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_16 1 + +#include + +static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i = 0; + + __m256 __alpha; + + __alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha)); + +#ifdef __AVX512CD__ + BLASLONG n64; + __m512 __alpha5; + __alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha)); + + n64 = n & ~63; + + for (; i < n64; i+= 64) { + _mm512_storeu_ps(&y[i + 0], _mm512_loadu_ps(&y[i + 0]) + __alpha5 * _mm512_loadu_ps(&x[i + 0])); + _mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16])); + _mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32])); + _mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48])); + } + +#endif + + for (; i < n; i+= 32) { + _mm256_storeu_ps(&y[i + 0], _mm256_loadu_ps(&y[i + 0]) + __alpha * _mm256_loadu_ps(&x[i + 0])); + _mm256_storeu_ps(&y[i + 8], _mm256_loadu_ps(&y[i + 8]) + __alpha * _mm256_loadu_ps(&x[i + 8])); + _mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16])); + _mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24])); + } +} 
+#else +#include "saxpy_microk_haswell-2.c" +#endif + diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index c3ab2ffe6..3536afc9e 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c new file mode 100644 index 000000000..1fcb7f27c --- /dev/null +++ b/kernel/x86_64/sdot_microk_skylakex-2.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_16 1 + +#include + +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + +{ + int i = 0; + __m256 accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_ps(); + accum_1 = _mm256_setzero_ps(); + accum_2 = _mm256_setzero_ps(); + accum_3 = _mm256_setzero_ps(); + +#ifdef __AVX512CD__ + __m512 accum_05, accum_15, accum_25, accum_35; + int n64; + n64 = n & (~63); + + accum_05 = _mm512_setzero_ps(); + accum_15 = _mm512_setzero_ps(); + accum_25 = _mm512_setzero_ps(); + accum_35 = _mm512_setzero_ps(); + + for (; i < n64; i += 64) { + accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]); + accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]); + accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]); + accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]); + } + + /* + * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code + * below can continue using the intermediate results in its loop + */ + accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1); + accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1); + accum_2 = 
_mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1); + accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1); + +#endif + for (; i < n; i += 32) { + accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]); + accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]); + accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]); + accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]); + } + + /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */ + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128 half_accum0; + + /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */ + half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1); + + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + + *dot = half_accum0[0]; +} + +#else +#include "sdot_microk_haswell-2.c" +#endif diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 91806bb1d..43da45640 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -280,8 +280,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -297,9 +297,9 @@ * * Determine the block size, the workspace size and the hous size. 
* - IB = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f index fd8c3fbe0..e334532fe 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.f +++ b/lapack-netlib/SRC/chetrd_he2hb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index 4ca0507e4..4d81fe226 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -277,8 +277,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -294,9 +294,9 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. 
.NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f index 85337f792..e0a5debc5 100644 --- a/lapack-netlib/SRC/dsytrd_sy2sb.f +++ b/lapack-netlib/SRC/dsytrd_sy2sb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index bd645327e..0df1173e4 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -277,8 +277,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -294,9 +294,9 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f index c01fe3598..272876700 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.f +++ b/lapack-netlib/SRC/ssytrd_sy2sb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. 
Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index 508afca06..86122cccc 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -280,8 +280,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -297,9 +297,9 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f index e35578b42..e33bf4b2b 100644 --- a/lapack-netlib/SRC/zhetrd_he2hb.f +++ b/lapack-netlib/SRC/zhetrd_he2hb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1