diff --git a/Makefile b/Makefile index 56b4426f8..49dab6484 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif +ifeq ($(NO_FORTRAN), 1) +define NOFORTRAN +1 +endef +define NO_LAPACK +1 +endef +export NOFORTRAN +export NO_LAPACK +endif + LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench @@ -47,7 +58,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -108,7 +119,7 @@ endif endif tests : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -210,7 +221,7 @@ netlib : else netlib : lapack_prebuild -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -231,7 +242,10 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : -ifndef NOFORTRAN + $(info filter value of NOFORTRAN is:) + $(info x$(filter-out $(NOFORTRAN), 1 2)x) + +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -274,21 +288,21 @@ endif endif large.tgz : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; fi endif timing.tgz : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/timing.tgz; fi endif lapack-timing : large.tgz timing.tgz -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING diff --git a/Makefile.rule b/Makefile.rule index 5c03d0195..649aabe70 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -60,6 +60,14 @@ VERSION = 0.3.1.dev # This flag is always set for POWER8. Don't modify the flag # USE_OPENMP = 1 +# The OpenMP scheduler to use - by default this is "static" and you +# will normally not want to change this unless you know that your main +# workload will involve tasks that have highly unbalanced running times +# for individual threads. Changing away from "static" may also adversely +# affect memory access locality in NUMA systems. Setting to "runtime" will +# allow you to select the scheduler from the environment variable OMP_SCHEDULE +# CCOMMON_OPT += -DOMP_SCHED=dynamic + # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. @@ -156,8 +164,11 @@ NO_AFFINITY = 1 # CONSISTENT_FPCSR = 1 # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute -# with single thread. You can use this flag to avoid the overhead of multi-threading -# in small matrix sizes. The default value is 4. +# with single thread. (Actually in recent versions this is a factor proportional to the +# number of floating point operations necessary for the given problem size, no longer +# an individual dimension). You can use this setting to avoid the overhead of multi- +# threading in small matrix sizes. The default value is 4, but values as high as 50 have +# been reported to be optimal for certain workloads (50 is the recommended value for Julia). # GEMM_MULTITHREAD_THRESHOLD = 4 # If you need santy check by comparing reference BLAS. It'll be very diff --git a/Makefile.system b/Makefile.system index 62ba0e466..5dffd8d2e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -248,7 +248,7 @@ endif ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET -export MACOSX_DEPLOYMENT_TARGET=10.6 +export MACOSX_DEPLOYMENT_TARGET=10.8 endif MD5SUM = md5 -r endif diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 1ba63278a..677c05d93 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,6 +8,13 @@ endif endif endif +ifeq ($(CORE), SKYLAKEX) +ifndef NO_AVX512 +CCOMMON_OPT += -march=skylake-avx512 +FCOMMON_OPT += -march=skylake-avx512 +endif +endif + ifeq ($(OSNAME), Interix) ARFLAGS = -m x64 endif diff --git a/cblas.h b/cblas.h index 89f78c133..6461f4209 100644 --- a/cblas.h +++ b/cblas.h @@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); diff --git a/common_stackalloc.h b/common_stackalloc.h index 71fb1a477..ec0fa1611 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - large enough to support all architectures and kernel * Chosing a too small SIZE will lead to a stack smashing. */ -#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ - /* make it volatile because some function (ex: dgemv_n.S) */ \ - /* do not restore all register */ \ - volatile int stack_alloc_size = SIZE; \ - if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ - stack_alloc_size = 0; \ - STACK_ALLOC_PROTECT_SET \ - TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ +#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ + /* make it volatile because some function (ex: dgemv_n.S) */ \ + /* do not restore all register */ \ + volatile int stack_alloc_size = SIZE; \ + if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \ + STACK_ALLOC_PROTECT_SET \ + /* Avoid declaring an array of length 0 */ \ + TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \ + __attribute__((aligned(0x20))); \ BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); #else //Original OpenBLAS/GotoBLAS codes. diff --git a/common_x86_64.h b/common_x86_64.h index 7461aaf60..62e138e34 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -60,8 +60,13 @@ #endif */ -#define MB -#define WMB +#ifdef __GNUC__ +#define MB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) +#else +#define MB do {} while (0) +#define WMB do {} while (0) +#endif static void __inline blas_lock(volatile BLASULONG *address){ diff --git a/cpuid_x86.c b/cpuid_x86.c index fc937865c..89eb809b0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1339,6 +1339,23 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; + case 6: + switch (model) { + case 6: // Cannon Lake +#ifndef NO_AVX512 + return CPUTYPE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; +#endif + } + break; case 9: case 8: switch (model) { diff --git a/ctest/Makefile b/ctest/Makefile index 6eda43863..569a5dda3 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -102,7 +102,13 @@ clean :: rm -f x* FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = +ifeq ($(USE_OPENMP), 1) +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB = -lomp +endif +endif +endif # Single real xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 4ab1ee8cc..ee3e3b9a9 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -91,11 +91,7 @@ #endif typedef struct { -#if __STDC_VERSION__ >= 201112L -_Atomic -#else volatile -#endif BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; @@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { - /* Make sure if no one is using workspace */ - START_RPCC(); - for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; - STOP_RPCC(waiting1); - #if defined(FUSED_GEMM) && !defined(TIMING) /* Fused operation to copy region of B into workspace and apply kernel */ @@ -391,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif - /* Set flag so other threads can access local region of B */ - for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) + for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { + /* Make sure if no one is using workspace */ + START_RPCC(); + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; + STOP_RPCC(waiting1); + /* Set flag so other threads can access local region of B */ job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - WMB; + WMB; + } } /* Get regions of B from other threads and apply kernel */ @@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Wait until other region of B is initialized */ START_RPCC(); - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; STOP_RPCC(waiting2); /* Apply kernel with local region of A and part of other region of B */ @@ -430,12 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + WMB; } } } while (current != mypos); - /* Iterate through steps of m + /* Iterate through steps of m * Note: First step has already been finished */ for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; @@ -465,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, js); STOP_RPCC(kernel); - + #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; #endif - + /* Clear synchronization flag if this thread is done with region of B */ if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; WMB; } } @@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); for (i = 0; i < args -> nthreads; i++) { for (js = 0; js < DIVIDE_RATE; js++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; } } STOP_RPCC(waiting3); @@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG } /* Clear synchronization flags */ - for (i = 0; i < MAX_CPU_NUMBER; i++) { - for (j = 0; j < MAX_CPU_NUMBER; j++) { + for (i = 0; i < nthreads; i++) { + for (j = 0; j < nthreads; j++) { for (k = 0; k < DIVIDE_RATE; k++) { job[i].working[j][CACHE_LINE_SIZE * k] = 0; } diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index fccdb4320..4255852c8 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -48,6 +48,10 @@ #else +#ifndef OMP_SCHED +#define OMP_SCHED static +#endif + int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; @@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(OMP_SCHED) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4271c0a0d..bacd3b7fa 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -338,6 +338,23 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } return NULL; + case 6: + if (model == 6) { + // Cannon Lake +#ifndef NO_AVX512 + return &gotoblas_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return &gotoblas_HASWELL; +#else + return &gotblas_SANDYBRIDGE; +#endif + else + return &gotoblas_NEHALEM; +#endif + } + return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake diff --git a/driver/others/memory.c b/driver/others/memory.c index d69e52e97..7eff16ce3 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FIXED_PAGESIZE 4096 #endif +#ifndef BUFFERS_PER_THREAD +#ifdef USE_OPENMP +#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#else +#define BUFFERS_PER_THREAD NUM_BUFFERS +#endif +#endif + #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) @@ -213,7 +221,7 @@ int i,n; ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) return nums; ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; #endif @@ -318,6 +326,8 @@ int goto_get_num_procs (void) { return blas_cpu_number; } +static void blas_memory_init(); + void openblas_fork_handler() { // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is @@ -329,7 +339,7 @@ void openblas_fork_handler() // implementation of OpenMP. #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; - err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); + err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init); if(err != 0) openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n"); #endif @@ -407,16 +417,104 @@ int openblas_get_num_threads(void) { #endif } -struct release_t { - void *address; - void (*func)(struct release_t *); - long attr; -}; - int hugetlb_allocated = 0; -static struct release_t release_info[NUM_BUFFERS]; -static int release_pos = 0; +#if defined(OS_WINDOWS) +#define THREAD_LOCAL __declspec(thread) +#define LIKELY_ONE(x) (x) +#else +#define THREAD_LOCAL __thread +#define LIKELY_ONE(x) (__builtin_expect(x, 1)) +#endif + +/* Stores information about the allocation and how to release it */ +struct alloc_t { + /* Whether this allocation is being used */ + int used; + /* Any special attributes needed when releasing this allocation */ + int attr; + /* Function that can properly release this memory */ + void (*release_func)(struct alloc_t *); + /* Pad to 64-byte alignment */ + char pad[64 - 2 * sizeof(int) - sizeof(void(*))]; +}; + +/* Convenience macros for storing release funcs */ +#define STORE_RELEASE_FUNC(address, func) \ + if (address != (void *)-1) { \ + struct alloc_t *alloc_info = (struct alloc_t *)address; \ + alloc_info->release_func = func; \ + } + +#define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \ + if (address != (void *)-1) { \ + struct alloc_t *alloc_info = (struct alloc_t *)address; \ + alloc_info->release_func = func; \ + alloc_info->attr = attr; \ + } + +/* The number of bytes that will be allocated for each buffer. When allocating + memory, we store an alloc_t followed by the actual buffer memory. This means + that each allocation always has its associated alloc_t, without the need + for an auxiliary tracking structure. */ +static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); + +/* Clang supports TLS from version 2.8 */ +#if defined(__clang__) && __clang_major__ > 2 || \ + (__clang_minor__ == 2 || __clang_minor__ == 8) +#define HAS_COMPILER_TLS +#endif + +/* GCC supports TLS from version 4.1 */ +#if !defined(__clang__) && defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) +#define HAS_COMPILER_TLS +#endif + +/* MSVC supports TLS from version 2005 */ +#if defined(_MSC_VER) && _MSC_VER >= 1400 +#define HAS_COMPILER_TLS +#endif + +/* Versions of XCode before 8 did not properly support TLS */ +#if defined(__apple_build_version__) && __apple_build_version__ < 8000042 +#undef HAS_COMPILER_TLS +#endif + +/* Android NDK's before version 12b did not support TLS */ +#if defined(__ANDROID__) && defined(__clang__) +#if __has_include() +#include +#endif +#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ + defined(__NDK_MINOR__) && \ + ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) +#undef HAS_COMPILER_TLS +#endif +#endif + +/* Holds pointers to allocated memory */ +#if defined(SMP) && !defined(USE_OPENMP) +/* This is the number of threads than can be spawned by the server, which is the + server plus the number of threads in the thread pool */ +# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +static int next_memory_table_pos = 0; +# if defined(HAS_COMPILER_TLS) +/* Use compiler generated thread-local-storage */ +static int THREAD_LOCAL local_memory_table_pos = 0; +# else +/* Use system-dependent thread-local-storage */ +# if defined(OS_WINDOWS) +static DWORD local_storage_key; +# else +static pthread_key_t local_storage_key; +# endif /* defined(OS_WINDOWS) */ +# endif /* defined(HAS_COMPILER_TLS) */ +#else +/* There is only one allocating thread when in single-threaded mode and when using OpenMP */ +# define MAX_ALLOCATING_THREADS 1 +#endif /* defined(SMP) && !defined(USE_OPENMP) */ +static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD]; #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -432,11 +530,41 @@ static pthread_spinlock_t alloc_lock = 0; static BLASULONG alloc_lock = 0UL; #endif +/* Returns a pointer to the start of the per-thread memory allocation data */ +static __inline struct alloc_t ** get_memory_table() { +#if defined(SMP) && !defined(USE_OPENMP) +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); +# else + int local_memory_table_pos = (int)pthread_getspecific(local_storage_key); +# endif /* defined(OS_WINDOWS) */ +# endif /* !defined(HAS_COMPILER_TLS) */ + if (!local_memory_table_pos) { + LOCK_COMMAND(&alloc_lock); + local_memory_table_pos = next_memory_table_pos++; + if (next_memory_table_pos > MAX_ALLOCATING_THREADS) + printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); + UNLOCK_COMMAND(&alloc_lock); +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); +# else + pthread_setspecific(local_storage_key, (void*)local_memory_table_pos); +# endif /* defined(OS_WINDOWS) */ +# endif /* !defined(HAS_COMPILER_TLS) */ + } + return local_memory_table[local_memory_table_pos]; +#else + return local_memory_table[0]; +#endif /* defined(SMP) && !defined(USE_OPENMP) */ +} + #ifdef ALLOC_MMAP -static void alloc_mmap_free(struct release_t *release){ +static void alloc_mmap_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + if (munmap(alloc_info, allocation_block_size)) { printf("OpenBLAS : munmap failed\n"); } } @@ -450,28 +578,18 @@ static void *alloc_mmap(void *address){ if (address){ map_address = mmap(address, - BUFFER_SIZE, + allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); } else { map_address = mmap(address, - BUFFER_SIZE, + allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0); } - if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_mmap_free; - release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif - } + STORE_RELEASE_FUNC(map_address, alloc_mmap_free); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif return map_address; @@ -524,25 +642,25 @@ static void *alloc_mmap(void *address){ if (address){ /* Just give up use advanced operation */ - map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #if defined(OS_LINUX) && !defined(NO_WARMUP) if (hot_alloc == 0) { - map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); + map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #endif - map_address = mmap(NULL, BUFFER_SIZE * SCALING, + map_address = mmap(NULL, allocation_block_size * SCALING, MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { @@ -550,7 +668,7 @@ static void *alloc_mmap(void *address){ #ifdef OS_LINUX #ifdef DEBUG int ret=0; - ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ int errsv=errno; perror("OpenBLAS alloc_mmap:"); @@ -558,7 +676,7 @@ static void *alloc_mmap(void *address){ } #else - my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); #endif #endif @@ -566,7 +684,7 @@ static void *alloc_mmap(void *address){ allocsize = DGEMM_P * DGEMM_Q * sizeof(double); start = (BLASULONG)map_address; - current = (SCALING - 1) * BUFFER_SIZE; + current = (SCALING - 1) * allocation_block_size; while(current > 0) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; @@ -581,7 +699,7 @@ static void *alloc_mmap(void *address){ best = (BLASULONG)-1; best_address = map_address; - while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) { current = run_bench(start, allocsize); @@ -597,7 +715,7 @@ static void *alloc_mmap(void *address){ if ((BLASULONG)best_address > (BLASULONG)map_address) munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); - munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); + munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address); map_address = best_address; @@ -610,17 +728,7 @@ static void *alloc_mmap(void *address){ } #endif - if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_mmap_free; - release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif - } + STORE_RELEASE_FUNC(map_address, alloc_mmap_free); return map_address; } @@ -632,9 +740,9 @@ static void *alloc_mmap(void *address){ #ifdef ALLOC_MALLOC -static void alloc_malloc_free(struct release_t *release){ +static void alloc_malloc_free(struct alloc_t *alloc_info){ - free(release -> address); + free(alloc_info); } @@ -642,15 +750,11 @@ static void *alloc_malloc(void *address){ void *map_address; - map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_malloc_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_malloc_free); return map_address; @@ -667,24 +771,20 @@ void *qfree (void *address); #define QCOMMS 0x2 #define QFAST 0x4 -static void alloc_qalloc_free(struct release_t *release){ +static void alloc_qalloc_free(struct alloc_t *alloc_info){ - qfree(release -> address); + qfree(alloc_info); } static void *alloc_qalloc(void *address){ void *map_address; - map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE); + map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_qalloc_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_qalloc_free); return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1)); } @@ -693,9 +793,9 @@ static void *alloc_qalloc(void *address){ #ifdef ALLOC_WINDOWS -static void alloc_windows_free(struct release_t *release){ +static void alloc_windows_free(struct alloc_t *alloc_info){ - VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT); } @@ -703,17 +803,13 @@ static void *alloc_windows(void *address){ void *map_address; map_address = VirtualAlloc(address, - BUFFER_SIZE, + allocation_block_size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_windows_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_windows_free); return map_address; } @@ -725,13 +821,14 @@ static void *alloc_windows(void *address){ #define DEVICEDRIVER_NAME "/dev/mapper" #endif -static void alloc_devicedirver_free(struct release_t *release){ +static void alloc_devicedirver_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + int attr = alloc_info -> attr; + if (munmap(address, allocation_block_size)) { printf("OpenBLAS : Bugphysarea unmap failed.\n"); } - if (close(release -> attr)) { + if (close(attr)) { printf("OpenBLAS : Bugphysarea close failed.\n"); } @@ -748,17 +845,12 @@ static void *alloc_devicedirver(void *address){ } - map_address = mmap(address, BUFFER_SIZE, + map_address = mmap(address, allocation_block_size, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].attr = fd; - release_info[release_pos].func = alloc_devicedirver_free; - release_pos ++; - } + STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd); return map_address; } @@ -767,9 +859,9 @@ static void *alloc_devicedirver(void *address){ #ifdef ALLOC_SHM -static void alloc_shm_free(struct release_t *release){ +static void alloc_shm_free(struct alloc_t *alloc_info){ - if (shmdt(release -> address)) { + if (shmdt(alloc_info)) { printf("OpenBLAS : Shared memory unmap failed.\n"); } } @@ -778,22 +870,21 @@ static void *alloc_shm(void *address){ void *map_address; int shmid; - shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); + shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600); map_address = (void *)shmat(shmid, address, 0); if (map_address != (void *)-1){ #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif shmctl(shmid, IPC_RMID, 0); - release_info[release_pos].address = map_address; - release_info[release_pos].attr = shmid; - release_info[release_pos].func = alloc_shm_free; - release_pos ++; + struct alloc_t *alloc_info = (struct alloc_t *)map_address; + alloc_info->release_func = alloc_shm_free; + alloc_info->attr = shmid; } return map_address; @@ -801,23 +892,23 @@ static void *alloc_shm(void *address){ #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS -static void alloc_hugetlb_free(struct release_t *release){ +static void alloc_hugetlb_free(struct alloc_t *alloc_info){ #if defined(OS_LINUX) || defined(OS_AIX) - if (shmdt(release -> address)) { + if (shmdt(alloc_info)) { printf("OpenBLAS : Hugepage unmap failed.\n"); } #endif #ifdef __sun__ - munmap(release -> address, BUFFER_SIZE); + munmap(alloc_info, allocation_block_size); #endif #ifdef OS_WINDOWS - VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT); #endif @@ -830,7 +921,7 @@ static void *alloc_hugetlb(void *address){ #if defined(OS_LINUX) || defined(OS_AIX) int shmid; - shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, + shmid = shmget(IPC_PRIVATE, allocation_block_size, #ifdef OS_LINUX SHM_HUGETLB | #endif @@ -843,7 +934,7 @@ static void *alloc_hugetlb(void *address){ map_address = (void *)shmat(shmid, address, SHM_RND); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif if (map_address != (void *)-1){ @@ -860,7 +951,7 @@ static void *alloc_hugetlb(void *address){ mha.mha_pagesize = HUGE_PAGESIZE; memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0); - map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE); + map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size); #endif #ifdef OS_WINDOWS @@ -872,7 +963,7 @@ static void *alloc_hugetlb(void *address){ tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { CloseHandle(hToken); return (void*)-1; @@ -884,7 +975,7 @@ static void *alloc_hugetlb(void *address){ } map_address = (void *)VirtualAlloc(address, - BUFFER_SIZE, + allocation_block_size, MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); @@ -895,11 +986,7 @@ static void *alloc_hugetlb(void *address){ #endif - if (map_address != (void *)-1){ - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_hugetlb_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free); return map_address; } @@ -911,13 +998,14 @@ static void *alloc_hugetlb(void *address){ static int hugetlb_pid = 0; -static void alloc_hugetlbfile_free(struct release_t *release){ +static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + int attr = alloc_info -> attr; + if (munmap(alloc_info, allocation_block_size)) { printf("OpenBLAS : HugeTLBfs unmap failed.\n"); } - if (close(release -> attr)) { + if (close(attr)) { printf("OpenBLAS : HugeTLBfs close failed.\n"); } } @@ -938,17 +1026,12 @@ static void *alloc_hugetlbfile(void *address){ unlink(filename); - map_address = mmap(address, BUFFER_SIZE, + map_address = mmap(address, allocation_block_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].attr = fd; - release_info[release_pos].func = alloc_hugetlbfile_free; - release_pos ++; - } + STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd); return map_address; } @@ -961,35 +1044,35 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -static volatile struct { - BLASULONG lock; - void *addr; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int pos; -#endif - int used; -#ifndef __64BIT__ - char dummy[48]; +#if __STDC_VERSION__ >= 201112L +static _Atomic int memory_initialized = 0; #else - char dummy[40]; +static volatile int memory_initialized = 0; #endif -} memory[NUM_BUFFERS]; - -static int memory_initialized = 0; - /* Memory allocation routine */ /* procpos ... indicates where it comes from */ /* 0 : Level 3 functions */ /* 1 : Level 2 functions */ /* 2 : Thread */ +static void blas_memory_init(){ +#if defined(SMP) && !defined(USE_OPENMP) + next_memory_table_pos = 0; +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + local_storage_key = ::TlsAlloc(); +# else + pthread_key_create(&local_storage_key, NULL); +# endif /* defined(OS_WINDOWS) */ +# endif /* defined(HAS_COMPILER_TLS) */ +#endif /* defined(SMP) && !defined(USE_OPENMP) */ + memset(local_memory_table, 0, sizeof(local_memory_table)); +} + void *blas_memory_alloc(int procpos){ int position; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos; -#endif void *map_address; @@ -1019,103 +1102,54 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); + struct alloc_t * alloc_info; + struct alloc_t ** alloc_table; -#if defined(USE_OPENMP) - if (!memory_initialized) { + if (!LIKELY_ONE(memory_initialized)) { +#if defined(SMP) && !defined(USE_OPENMP) + /* Only allow a single thread to initialize memory system */ + LOCK_COMMAND(&alloc_lock); + + if (!memory_initialized) { #endif - - LOCK_COMMAND(&alloc_lock); - - if (!memory_initialized) { - -#if defined(WHEREAMI) && !defined(USE_OPENMP) - for (position = 0; position < NUM_BUFFERS; position ++){ - memory[position].addr = (void *)0; - memory[position].pos = -1; - memory[position].used = 0; - memory[position].lock = 0; - } -#endif - + blas_memory_init(); #ifdef DYNAMIC_ARCH - gotoblas_dynamic_init(); + gotoblas_dynamic_init(); #endif #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) - gotoblas_affinity_init(); + gotoblas_affinity_init(); #endif #ifdef SMP - if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); + if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) #ifndef DYNAMIC_ARCH - blas_set_parameter(); + blas_set_parameter(); #endif #endif - memory_initialized = 1; + memory_initialized = 1; - } - UNLOCK_COMMAND(&alloc_lock); -#if defined(USE_OPENMP) - } +#if defined(SMP) && !defined(USE_OPENMP) + } + UNLOCK_COMMAND(&alloc_lock); #endif + } #ifdef DEBUG printf("Alloc Start ...\n"); -#endif - -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - mypos = WhereAmI(); - - position = mypos; - while (position >= NUM_BUFFERS) position >>= 1; - - do { - if (!memory[position].used && (memory[position].pos == mypos)) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - blas_lock(&memory[position].lock); -#endif - if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif - } - - position ++; - - } while (position < NUM_BUFFERS); - - #endif position = 0; - + alloc_table = get_memory_table(); do { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - if (!memory[position].used) { - blas_lock(&memory[position].lock); -#endif - if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); - } -#endif - + if (!alloc_table[position] || !alloc_table[position]->used) goto allocation; position ++; - } while (position < NUM_BUFFERS); + } while (position < BUFFERS_PER_THREAD); goto error; @@ -1125,14 +1159,8 @@ void *blas_memory_alloc(int procpos){ printf(" Position -> %d\n", position); #endif - memory[position].used = 1; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif - - if (!memory[position].addr) { + alloc_info = alloc_table[position]; + if (!alloc_info) { do { #ifdef DEBUG printf("Allocation Start : %lx\n", base_address); @@ -1144,18 +1172,18 @@ void *blas_memory_alloc(int procpos){ while ((func != NULL) && (map_address == (void *) -1)) { - map_address = (*func)((void *)base_address); + map_address = (*func)((void *)base_address); #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n"); } #endif #ifdef ALLOC_HUGETLBFILE if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { #ifndef OS_WINDOWS - fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n"); #endif } #endif @@ -1172,89 +1200,44 @@ void *blas_memory_alloc(int procpos){ #endif if (((BLASLONG) map_address) == -1) base_address = 0UL; - if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE; } while ((BLASLONG)map_address == -1); -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif - memory[position].addr = map_address; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif + alloc_table[position] = alloc_info = map_address; #ifdef DEBUG - printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); + printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position); #endif } -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - if (memory[position].pos == -1) memory[position].pos = mypos; - -#endif - -#ifdef DYNAMIC_ARCH - - if (memory_initialized == 1) { - - LOCK_COMMAND(&alloc_lock); - - if (memory_initialized == 1) { - - if (!gotoblas) gotoblas_dynamic_init(); - - memory_initialized = 2; - } - - UNLOCK_COMMAND(&alloc_lock); - - } -#endif - - #ifdef DEBUG - printf("Mapped : %p %3d\n\n", - (void *)memory[position].addr, position); + printf("Mapped : %p %3d\n\n", (void *)alloc_info, position); #endif - return (void *)memory[position].addr; + alloc_info->used = 1; + + return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); error: - printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); return NULL; } -void blas_memory_free(void *free_area){ - +void blas_memory_free(void *buffer){ +#ifdef DEBUG int position; + struct alloc_t ** alloc_table; +#endif + /* Since we passed an offset pointer to the caller, get back to the actual allocation */ + struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t)); #ifdef DEBUG - printf("Unmapped Start : %p ...\n", free_area); + printf("Unmapped Start : %p ...\n", alloc_info); #endif - position = 0; -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif - while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) - position++; - - if (memory[position].addr != free_area) goto error; - -#ifdef DEBUG - printf(" Position : %d\n", position); -#endif - - // arm: ensure all writes are finished before other thread takes this memory - WMB; - - memory[position].used = 0; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif + alloc_info->used = 0; #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -1262,15 +1245,13 @@ void blas_memory_free(void *free_area){ return; - error: - printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); - #ifdef DEBUG - for (position = 0; position < NUM_BUFFERS; position++) - printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); -#endif -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); + alloc_table = get_memory_table(); + for (position = 0; position < BUFFERS_PER_THREAD; position++){ + if (alloc_table[position]) { + printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used); + } + } #endif return; } @@ -1287,16 +1268,20 @@ void blas_memory_free_nolock(void * map_address) { void blas_shutdown(void){ - int pos; + int pos, thread; #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif - LOCK_COMMAND(&alloc_lock); - - for (pos = 0; pos < release_pos; pos ++) { - release_info[pos].func(&release_info[pos]); + for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){ + for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ + struct alloc_t *alloc_info = local_memory_table[thread][pos]; + if (alloc_info) { + alloc_info->release_func(alloc_info); + alloc_info = (void *)0; + } + } } #ifdef SEEK_ADDRESS @@ -1305,17 +1290,6 @@ void blas_shutdown(void){ base_address = BASE_ADDRESS; #endif - for (pos = 0; pos < NUM_BUFFERS; pos ++){ - memory[pos].addr = (void *)0; - memory[pos].used = 0; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - memory[pos].pos = -1; -#endif - memory[pos].lock = 0; - } - - UNLOCK_COMMAND(&alloc_lock); - return; } @@ -1339,7 +1313,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, size_t size; BLASULONG buffer; - size = BUFFER_SIZE - PAGESIZE; + size = allocation_block_size - PAGESIZE; buffer = (BLASULONG)sa + GEMM_OFFSET_A; #if defined(OS_LINUX) && !defined(NO_WARMUP) @@ -1360,7 +1334,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, UNLOCK_COMMAND(&init_lock); #endif - size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE); + size = MIN((allocation_block_size - PAGESIZE), L2_SIZE); buffer = (BLASULONG)sa + GEMM_OFFSET_A; while (size > 0) { diff --git a/interface/Makefile b/interface/Makefile index 9b2b93b83..20ec74e9e 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) CSBLAS1OBJS = \ - cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ + cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) @@ -277,7 +277,7 @@ CSBLAS3OBJS = \ cblas_sgeadd.$(SUFFIX) CDBLAS1OBJS = \ - cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ + cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) @@ -294,7 +294,7 @@ CDBLAS3OBJS += \ cblas_dgeadd.$(SUFFIX) CCBLAS1OBJS = \ - cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ + cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ cblas_ccopy.$(SUFFIX) \ cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ @@ -320,7 +320,7 @@ CCBLAS3OBJS = \ CZBLAS1OBJS = \ - cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ + cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ cblas_zcopy.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ @@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) +cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) diff --git a/param.h b/param.h index 49a5e85e8..cfa4bba5c 100644 --- a/param.h +++ b/param.h @@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 32 #ifdef ARCH_X86 @@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 32 #ifdef ARCH_X86 diff --git a/test/Makefile b/test/Makefile index 65fb6f438..074411b05 100644 --- a/test/Makefile +++ b/test/Makefile @@ -122,8 +122,13 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = - +ifeq ($(USE_OPENMP), 1) +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB = -lomp +endif +endif +endif sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 77a42d84f..1b426afe7 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -25,7 +25,6 @@ endif () # known to hang with the native Windows and Android threads # FIXME needs checking if this works on any of the other platforms -if (NOT NO_CBLAS) if (NOT USE_OPENMP) if (OS_CYGWIN_NT OR OS_LINUX) set(OpenBLAS_utest_src @@ -34,7 +33,6 @@ set(OpenBLAS_utest_src ) endif() endif() -endif() if (NOT NO_LAPACK) set(OpenBLAS_utest_src diff --git a/utest/Makefile b/utest/Makefile index e071540dc..e40b3c6db 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -17,13 +17,11 @@ endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch -ifneq ($(NO_CBLAS), 1) ifndef USE_OPENMP ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) OBJS += test_fork.o endif endif -endif all : run_test diff --git a/utest/test_fork.c b/utest/test_fork.c index 9e0244305..9fc51287c 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -48,11 +48,13 @@ void* xmalloc(size_t n) } } -void check_dgemm(double *a, double *b, double *result, double *expected, int n) +void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) { + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; int i; - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, - 1.0, a, n, b, n, 0.0, result, n); + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n); for(i = 0; i < n * n; ++i) { ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); } @@ -60,7 +62,7 @@ void check_dgemm(double *a, double *b, double *result, double *expected, int n) CTEST(fork, safety) { - int n = 1000; + blasint n = 1000; int i; double *a, *b, *c, *d; @@ -84,8 +86,10 @@ CTEST(fork, safety) // Compute a DGEMM product in the parent process prior to forking to // ensure that the OpenBLAS thread pool is initialized. - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, - 1.0, a, n, b, n, 0.0, c, n); + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, c, &n); fork_pid = fork(); if (fork_pid == -1) {