Merge pull request #1 from xianyi/develop

Merge xianyi:develop into develop
This commit is contained in:
Martin Kroeker 2018-06-25 20:45:56 +02:00 committed by GitHub
commit c38c65eb65
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 440 additions and 366 deletions

View File

@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1)
RELA = re_lapack
endif
ifeq ($(NO_FORTRAN), 1)
define NOFORTRAN
1
endef
define NO_LAPACK
1
endef
export NOFORTRAN
export NO_LAPACK
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
@ -47,7 +58,7 @@ endif
endif
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
endif
ifneq ($(OSNAME), AIX)
@ -108,7 +119,7 @@ endif
endif
tests :
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
@ -210,7 +221,7 @@ netlib :
else
netlib : lapack_prebuild
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
endif
@ -231,7 +242,10 @@ prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
lapack_prebuild :
ifndef NOFORTRAN
$(info filter value of NOFORTRAN is:)
$(info x$(filter-out $(NOFORTRAN), 1 2)x)
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@ -274,21 +288,21 @@ endif
endif
large.tgz :
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz;
fi
endif
timing.tgz :
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/timing.tgz;
fi
endif
lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING

View File

@ -60,6 +60,14 @@ VERSION = 0.3.1.dev
# This flag is always set for POWER8. Don't modify the flag
# USE_OPENMP = 1
# The OpenMP scheduler to use - by default this is "static" and you
# will normally not want to change this unless you know that your main
# workload will involve tasks that have highly unbalanced running times
# for individual threads. Changing away from "static" may also adversely
# affect memory access locality in NUMA systems. Setting to "runtime" will
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
# CCOMMON_OPT += -DOMP_SCHED=dynamic
# You can define maximum number of threads. Basically it should be
# less than actual number of cores. If you don't specify one, it's
# automatically detected by the script.
@ -156,8 +164,11 @@ NO_AFFINITY = 1
# CONSISTENT_FPCSR = 1
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
# with single thread. You can use this flag to avoid the overhead of multi-threading
# in small matrix sizes. The default value is 4.
# with single thread. (Actually in recent versions this is a factor proportional to the
# number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4
# If you need sanity check by comparing with reference BLAS. It'll be very

View File

@ -248,7 +248,7 @@ endif
ifeq ($(OSNAME), Darwin)
ifndef MACOSX_DEPLOYMENT_TARGET
export MACOSX_DEPLOYMENT_TARGET=10.6
export MACOSX_DEPLOYMENT_TARGET=10.8
endif
MD5SUM = md5 -r
endif

View File

@ -8,6 +8,13 @@ endif
endif
endif
ifeq ($(CORE), SKYLAKEX)
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
endif
endif
ifeq ($(OSNAME), Interix)
ARFLAGS = -m x64
endif

View File

@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

View File

@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* - large enough to support all architectures and kernel
* Choosing too small a SIZE will lead to stack smashing.
*/
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
stack_alloc_size = 0; \
STACK_ALLOC_PROTECT_SET \
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \
if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
STACK_ALLOC_PROTECT_SET \
/* Avoid declaring an array of length 0 */ \
TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
__attribute__((aligned(0x20))); \
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
#else
//Original OpenBLAS/GotoBLAS codes.

View File

@ -60,8 +60,13 @@
#endif
*/
#define MB
#define WMB
#ifdef __GNUC__
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
#else
#define MB do {} while (0)
#define WMB do {} while (0)
#endif
static void __inline blas_lock(volatile BLASULONG *address){

View File

@ -1339,6 +1339,23 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 6:
switch (model) {
case 6: // Cannon Lake
#ifndef NO_AVX512
return CPUTYPE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
#endif
}
break;
case 9:
case 8:
switch (model) {

View File

@ -102,7 +102,13 @@ clean ::
rm -f x*
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB =
ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
endif
endif
endif
# Single real
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)

View File

@ -91,11 +91,7 @@
#endif
typedef struct {
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
/* Make sure if no one is using workspace */
START_RPCC();
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
STOP_RPCC(waiting1);
#if defined(FUSED_GEMM) && !defined(TIMING)
/* Fused operation to copy region of B into workspace and apply kernel */
@ -391,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
#endif
/* Set flag so other threads can access local region of B */
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
/* Make sure if no one is using workspace */
START_RPCC();
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
/* Set flag so other threads can access local region of B */
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
WMB;
WMB;
}
}
/* Get regions of B from other threads and apply kernel */
@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Wait until other region of B is initialized */
START_RPCC();
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2);
/* Apply kernel with local region of A and part of other region of B */
@ -430,12 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
} while (current != mypos);
/* Iterate through steps of m
/* Iterate through steps of m
* Note: First step has already been finished */
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
@ -465,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, ldc, is, js);
STOP_RPCC(kernel);
#ifdef TIMING
ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif
/* Clear synchronization flag if this thread is done with region of B */
if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
}
}
STOP_RPCC(waiting3);
@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
}
/* Clear synchronization flags */
for (i = 0; i < MAX_CPU_NUMBER; i++) {
for (j = 0; j < MAX_CPU_NUMBER; j++) {
for (i = 0; i < nthreads; i++) {
for (j = 0; j < nthreads; j++) {
for (k = 0; k < DIVIDE_RATE; k++) {
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
}

View File

@ -48,6 +48,10 @@
#else
#ifndef OMP_SCHED
#define OMP_SCHED static
#endif
int blas_server_avail = 0;
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
break;
}
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(OMP_SCHED)
for (i = 0; i < num; i ++) {
#ifndef USE_SIMPLE_THREADED_LEVEL3

View File

@ -338,6 +338,23 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM;
}
return NULL;
case 6:
if (model == 6) {
// Cannon Lake
#ifndef NO_AVX512
return &gotoblas_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return &gotoblas_HASWELL;
#else
return &gotoblas_SANDYBRIDGE;
#endif
else
return &gotoblas_NEHALEM;
#endif
}
return NULL;
case 9:
case 8:
if (model == 14 ) { // Kaby Lake

View File

@ -13,9 +13,9 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FIXED_PAGESIZE 4096
#endif
#ifndef BUFFERS_PER_THREAD
#ifdef USE_OPENMP
#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#else
#define BUFFERS_PER_THREAD NUM_BUFFERS
#endif
#endif
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#if defined(_MSC_VER) && !defined(__clang__)
@ -213,7 +221,7 @@ int i,n;
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
#endif
@ -318,6 +326,8 @@ int goto_get_num_procs (void) {
return blas_cpu_number;
}
static void blas_memory_init();
void openblas_fork_handler()
{
// This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
@ -329,7 +339,7 @@ void openblas_fork_handler()
// implementation of OpenMP.
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
int err;
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);
if(err != 0)
openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
#endif
@ -407,16 +417,104 @@ int openblas_get_num_threads(void) {
#endif
}
struct release_t {
void *address;
void (*func)(struct release_t *);
long attr;
};
int hugetlb_allocated = 0;
static struct release_t release_info[NUM_BUFFERS];
static int release_pos = 0;
#if defined(OS_WINDOWS)
#define THREAD_LOCAL __declspec(thread)
#define LIKELY_ONE(x) (x)
#else
#define THREAD_LOCAL __thread
#define LIKELY_ONE(x) (__builtin_expect(x, 1))
#endif
/* Stores information about the allocation and how to release it */
struct alloc_t {
/* Whether this allocation is being used */
int used;
/* Any special attributes needed when releasing this allocation */
int attr;
/* Function that can properly release this memory */
void (*release_func)(struct alloc_t *);
/* Pad to 64-byte alignment */
char pad[64 - 2 * sizeof(int) - sizeof(void(*))];
};
/* Convenience macros for storing release funcs */
#define STORE_RELEASE_FUNC(address, func) \
if (address != (void *)-1) { \
struct alloc_t *alloc_info = (struct alloc_t *)address; \
alloc_info->release_func = func; \
}
#define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \
if (address != (void *)-1) { \
struct alloc_t *alloc_info = (struct alloc_t *)address; \
alloc_info->release_func = func; \
alloc_info->attr = attr; \
}
/* The number of bytes that will be allocated for each buffer. When allocating
memory, we store an alloc_t followed by the actual buffer memory. This means
that each allocation always has its associated alloc_t, without the need
for an auxiliary tracking structure. */
static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
/* Clang supports TLS from version 2.8 */
#if defined(__clang__) && __clang_major__ > 2 || \
(__clang_minor__ == 2 || __clang_minor__ == 8)
#define HAS_COMPILER_TLS
#endif
/* GCC supports TLS from version 4.1 */
#if !defined(__clang__) && defined(__GNUC__) && \
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
#define HAS_COMPILER_TLS
#endif
/* MSVC supports TLS from version 2005 */
#if defined(_MSC_VER) && _MSC_VER >= 1400
#define HAS_COMPILER_TLS
#endif
/* Versions of XCode before 8 did not properly support TLS */
#if defined(__apple_build_version__) && __apple_build_version__ < 8000042
#undef HAS_COMPILER_TLS
#endif
/* Android NDK's before version 12b did not support TLS */
#if defined(__ANDROID__) && defined(__clang__)
#if __has_include(<android/ndk-version.h>)
#include <android/ndk-version.h>
#endif
#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
defined(__NDK_MINOR__) && \
((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
#undef HAS_COMPILER_TLS
#endif
#endif
/* Holds pointers to allocated memory */
#if defined(SMP) && !defined(USE_OPENMP)
/* This is the number of threads than can be spawned by the server, which is the
server plus the number of threads in the thread pool */
# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER
static int next_memory_table_pos = 0;
# if defined(HAS_COMPILER_TLS)
/* Use compiler generated thread-local-storage */
static int THREAD_LOCAL local_memory_table_pos = 0;
# else
/* Use system-dependent thread-local-storage */
# if defined(OS_WINDOWS)
static DWORD local_storage_key;
# else
static pthread_key_t local_storage_key;
# endif /* defined(OS_WINDOWS) */
# endif /* defined(HAS_COMPILER_TLS) */
#else
/* There is only one allocating thread when in single-threaded mode and when using OpenMP */
# define MAX_ALLOCATING_THREADS 1
#endif /* defined(SMP) && !defined(USE_OPENMP) */
static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD];
#if defined(OS_LINUX) && !defined(NO_WARMUP)
static int hot_alloc = 0;
@ -432,11 +530,41 @@ static pthread_spinlock_t alloc_lock = 0;
static BLASULONG alloc_lock = 0UL;
#endif
/* Returns a pointer to the start of the per-thread memory allocation data */
static __inline struct alloc_t ** get_memory_table() {
#if defined(SMP) && !defined(USE_OPENMP)
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
int local_memory_table_pos = (int)::TlsGetValue(local_storage_key);
# else
int local_memory_table_pos = (int)pthread_getspecific(local_storage_key);
# endif /* defined(OS_WINDOWS) */
# endif /* !defined(HAS_COMPILER_TLS) */
if (!local_memory_table_pos) {
LOCK_COMMAND(&alloc_lock);
local_memory_table_pos = next_memory_table_pos++;
if (next_memory_table_pos > MAX_ALLOCATING_THREADS)
printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n");
UNLOCK_COMMAND(&alloc_lock);
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
::TlsSetValue(local_storage_key, (void*)local_memory_table_pos);
# else
pthread_setspecific(local_storage_key, (void*)local_memory_table_pos);
# endif /* defined(OS_WINDOWS) */
# endif /* !defined(HAS_COMPILER_TLS) */
}
return local_memory_table[local_memory_table_pos];
#else
return local_memory_table[0];
#endif /* defined(SMP) && !defined(USE_OPENMP) */
}
#ifdef ALLOC_MMAP
static void alloc_mmap_free(struct release_t *release){
static void alloc_mmap_free(struct alloc_t *alloc_info){
if (munmap(release -> address, BUFFER_SIZE)) {
if (munmap(alloc_info, allocation_block_size)) {
printf("OpenBLAS : munmap failed\n");
}
}
@ -450,28 +578,18 @@ static void *alloc_mmap(void *address){
if (address){
map_address = mmap(address,
BUFFER_SIZE,
allocation_block_size,
MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
} else {
map_address = mmap(address,
BUFFER_SIZE,
allocation_block_size,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
}
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
return map_address;
@ -524,25 +642,25 @@ static void *alloc_mmap(void *address){
if (address){
/* Just give up use advanced operation */
map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
} else {
#if defined(OS_LINUX) && !defined(NO_WARMUP)
if (hot_alloc == 0) {
map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
} else {
#endif
map_address = mmap(NULL, BUFFER_SIZE * SCALING,
map_address = mmap(NULL, allocation_block_size * SCALING,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
if (map_address != (void *)-1) {
@ -550,7 +668,7 @@ static void *alloc_mmap(void *address){
#ifdef OS_LINUX
#ifdef DEBUG
int ret=0;
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
if(ret==-1){
int errsv=errno;
perror("OpenBLAS alloc_mmap:");
@ -558,7 +676,7 @@ static void *alloc_mmap(void *address){
}
#else
my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
#endif
#endif
@ -566,7 +684,7 @@ static void *alloc_mmap(void *address){
allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
start = (BLASULONG)map_address;
current = (SCALING - 1) * BUFFER_SIZE;
current = (SCALING - 1) * allocation_block_size;
while(current > 0) {
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
@ -581,7 +699,7 @@ static void *alloc_mmap(void *address){
best = (BLASULONG)-1;
best_address = map_address;
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
current = run_bench(start, allocsize);
@ -597,7 +715,7 @@ static void *alloc_mmap(void *address){
if ((BLASULONG)best_address > (BLASULONG)map_address)
munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);
map_address = best_address;
@ -610,17 +728,7 @@ static void *alloc_mmap(void *address){
}
#endif
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
return map_address;
}
@ -632,9 +740,9 @@ static void *alloc_mmap(void *address){
#ifdef ALLOC_MALLOC
static void alloc_malloc_free(struct release_t *release){
static void alloc_malloc_free(struct alloc_t *alloc_info){
free(release -> address);
free(alloc_info);
}
@ -642,15 +750,11 @@ static void *alloc_malloc(void *address){
void *map_address;
map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
if (map_address == (void *)NULL) map_address = (void *)-1;
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_malloc_free;
release_pos ++;
}
STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
return map_address;
@ -667,24 +771,20 @@ void *qfree (void *address);
#define QCOMMS 0x2
#define QFAST 0x4
static void alloc_qalloc_free(struct release_t *release){
static void alloc_qalloc_free(struct alloc_t *alloc_info){
qfree(release -> address);
qfree(alloc_info);
}
static void *alloc_qalloc(void *address){
void *map_address;
map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
if (map_address == (void *)NULL) map_address = (void *)-1;
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_qalloc_free;
release_pos ++;
}
STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
}
@ -693,9 +793,9 @@ static void *alloc_qalloc(void *address){
#ifdef ALLOC_WINDOWS
static void alloc_windows_free(struct release_t *release){
static void alloc_windows_free(struct alloc_t *alloc_info){
VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
}
@ -703,17 +803,13 @@ static void *alloc_windows(void *address){
void *map_address;
map_address = VirtualAlloc(address,
BUFFER_SIZE,
allocation_block_size,
MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
if (map_address == (void *)NULL) map_address = (void *)-1;
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_windows_free;
release_pos ++;
}
STORE_RELEASE_FUNC(map_address, alloc_windows_free);
return map_address;
}
@ -725,13 +821,14 @@ static void *alloc_windows(void *address){
#define DEVICEDRIVER_NAME "/dev/mapper"
#endif
static void alloc_devicedirver_free(struct release_t *release){
static void alloc_devicedirver_free(struct alloc_t *alloc_info){
if (munmap(release -> address, BUFFER_SIZE)) {
int attr = alloc_info -> attr;
if (munmap(address, allocation_block_size)) {
printf("OpenBLAS : Bugphysarea unmap failed.\n");
}
if (close(release -> attr)) {
if (close(attr)) {
printf("OpenBLAS : Bugphysarea close failed.\n");
}
@ -748,17 +845,12 @@ static void *alloc_devicedirver(void *address){
}
map_address = mmap(address, BUFFER_SIZE,
map_address = mmap(address, allocation_block_size,
PROT_READ | PROT_WRITE,
MAP_FILE | MAP_SHARED,
fd, 0);
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].attr = fd;
release_info[release_pos].func = alloc_devicedirver_free;
release_pos ++;
}
STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);
return map_address;
}
@ -767,9 +859,9 @@ static void *alloc_devicedirver(void *address){
#ifdef ALLOC_SHM
static void alloc_shm_free(struct release_t *release){
static void alloc_shm_free(struct alloc_t *alloc_info){
if (shmdt(release -> address)) {
if (shmdt(alloc_info)) {
printf("OpenBLAS : Shared memory unmap failed.\n");
}
}
@ -778,22 +870,21 @@ static void *alloc_shm(void *address){
void *map_address;
int shmid;
shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);
map_address = (void *)shmat(shmid, address, 0);
if (map_address != (void *)-1){
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
shmctl(shmid, IPC_RMID, 0);
release_info[release_pos].address = map_address;
release_info[release_pos].attr = shmid;
release_info[release_pos].func = alloc_shm_free;
release_pos ++;
struct alloc_t *alloc_info = (struct alloc_t *)map_address;
alloc_info->release_func = alloc_shm_free;
alloc_info->attr = shmid;
}
return map_address;
@ -801,23 +892,23 @@ static void *alloc_shm(void *address){
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
static void alloc_hugetlb_free(struct release_t *release){
static void alloc_hugetlb_free(struct alloc_t *alloc_info){
#if defined(OS_LINUX) || defined(OS_AIX)
if (shmdt(release -> address)) {
if (shmdt(alloc_info)) {
printf("OpenBLAS : Hugepage unmap failed.\n");
}
#endif
#ifdef __sun__
munmap(release -> address, BUFFER_SIZE);
munmap(alloc_info, allocation_block_size);
#endif
#ifdef OS_WINDOWS
VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
#endif
@ -830,7 +921,7 @@ static void *alloc_hugetlb(void *address){
#if defined(OS_LINUX) || defined(OS_AIX)
int shmid;
shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
shmid = shmget(IPC_PRIVATE, allocation_block_size,
#ifdef OS_LINUX
SHM_HUGETLB |
#endif
@ -843,7 +934,7 @@ static void *alloc_hugetlb(void *address){
map_address = (void *)shmat(shmid, address, SHM_RND);
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
if (map_address != (void *)-1){
@ -860,7 +951,7 @@ static void *alloc_hugetlb(void *address){
mha.mha_pagesize = HUGE_PAGESIZE;
memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
#endif
#ifdef OS_WINDOWS
@ -872,7 +963,7 @@ static void *alloc_hugetlb(void *address){
tp.PrivilegeCount = 1;
tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
CloseHandle(hToken);
return (void*)-1;
@ -884,7 +975,7 @@ static void *alloc_hugetlb(void *address){
}
map_address = (void *)VirtualAlloc(address,
BUFFER_SIZE,
allocation_block_size,
MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
@ -895,11 +986,7 @@ static void *alloc_hugetlb(void *address){
#endif
if (map_address != (void *)-1){
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_hugetlb_free;
release_pos ++;
}
STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
return map_address;
}
@ -911,13 +998,14 @@ static void *alloc_hugetlb(void *address){
static int hugetlb_pid = 0;
static void alloc_hugetlbfile_free(struct release_t *release){
static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){
if (munmap(release -> address, BUFFER_SIZE)) {
int attr = alloc_info -> attr;
if (munmap(alloc_info, allocation_block_size)) {
printf("OpenBLAS : HugeTLBfs unmap failed.\n");
}
if (close(release -> attr)) {
if (close(attr)) {
printf("OpenBLAS : HugeTLBfs close failed.\n");
}
}
@ -938,17 +1026,12 @@ static void *alloc_hugetlbfile(void *address){
unlink(filename);
map_address = mmap(address, BUFFER_SIZE,
map_address = mmap(address, allocation_block_size,
PROT_READ | PROT_WRITE,
MAP_SHARED,
fd, 0);
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].attr = fd;
release_info[release_pos].func = alloc_hugetlbfile_free;
release_pos ++;
}
STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
return map_address;
}
@ -961,35 +1044,35 @@ static BLASULONG base_address = 0UL;
static BLASULONG base_address = BASE_ADDRESS;
#endif
static volatile struct {
BLASULONG lock;
void *addr;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int pos;
#endif
int used;
#ifndef __64BIT__
char dummy[48];
#if __STDC_VERSION__ >= 201112L
static _Atomic int memory_initialized = 0;
#else
char dummy[40];
static volatile int memory_initialized = 0;
#endif
} memory[NUM_BUFFERS];
static int memory_initialized = 0;
/* Memory allocation routine */
/* procpos ... indicates where it comes from */
/* 0 : Level 3 functions */
/* 1 : Level 2 functions */
/* 2 : Thread */
static void blas_memory_init(){
#if defined(SMP) && !defined(USE_OPENMP)
next_memory_table_pos = 0;
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
local_storage_key = ::TlsAlloc();
# else
pthread_key_create(&local_storage_key, NULL);
# endif /* defined(OS_WINDOWS) */
# endif /* defined(HAS_COMPILER_TLS) */
#endif /* defined(SMP) && !defined(USE_OPENMP) */
memset(local_memory_table, 0, sizeof(local_memory_table));
}
void *blas_memory_alloc(int procpos){
int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int mypos;
#endif
void *map_address;
@ -1019,103 +1102,54 @@ void *blas_memory_alloc(int procpos){
NULL,
};
void *(**func)(void *address);
struct alloc_t * alloc_info;
struct alloc_t ** alloc_table;
#if defined(USE_OPENMP)
if (!memory_initialized) {
if (!LIKELY_ONE(memory_initialized)) {
#if defined(SMP) && !defined(USE_OPENMP)
/* Only allow a single thread to initialize memory system */
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
#endif
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
#if defined(WHEREAMI) && !defined(USE_OPENMP)
for (position = 0; position < NUM_BUFFERS; position ++){
memory[position].addr = (void *)0;
memory[position].pos = -1;
memory[position].used = 0;
memory[position].lock = 0;
}
#endif
blas_memory_init();
#ifdef DYNAMIC_ARCH
gotoblas_dynamic_init();
gotoblas_dynamic_init();
#endif
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
gotoblas_affinity_init();
gotoblas_affinity_init();
#endif
#ifdef SMP
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
#ifndef DYNAMIC_ARCH
blas_set_parameter();
blas_set_parameter();
#endif
#endif
memory_initialized = 1;
memory_initialized = 1;
}
UNLOCK_COMMAND(&alloc_lock);
#if defined(USE_OPENMP)
}
#if defined(SMP) && !defined(USE_OPENMP)
}
UNLOCK_COMMAND(&alloc_lock);
#endif
}
#ifdef DEBUG
printf("Alloc Start ...\n");
#endif
#if defined(WHEREAMI) && !defined(USE_OPENMP)
mypos = WhereAmI();
position = mypos;
while (position >= NUM_BUFFERS) position >>= 1;
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#else
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
#endif
}
position ++;
} while (position < NUM_BUFFERS);
#endif
position = 0;
alloc_table = get_memory_table();
do {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#else
if (!memory[position].used) {
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
}
#endif
if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
position ++;
} while (position < NUM_BUFFERS);
} while (position < BUFFERS_PER_THREAD);
goto error;
@ -1125,14 +1159,8 @@ void *blas_memory_alloc(int procpos){
printf(" Position -> %d\n", position);
#endif
memory[position].used = 1;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
#endif
if (!memory[position].addr) {
alloc_info = alloc_table[position];
if (!alloc_info) {
do {
#ifdef DEBUG
printf("Allocation Start : %lx\n", base_address);
@ -1144,18 +1172,18 @@ void *blas_memory_alloc(int procpos){
while ((func != NULL) && (map_address == (void *) -1)) {
map_address = (*func)((void *)base_address);
map_address = (*func)((void *)base_address);
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
}
#endif
#ifdef ALLOC_HUGETLBFILE
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
#endif
}
#endif
@ -1172,89 +1200,44 @@ void *blas_memory_alloc(int procpos){
#endif
if (((BLASLONG) map_address) == -1) base_address = 0UL;
if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE;
} while ((BLASLONG)map_address == -1);
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
alloc_table[position] = alloc_info = map_address;
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
#endif
}
#if defined(WHEREAMI) && !defined(USE_OPENMP)
if (memory[position].pos == -1) memory[position].pos = mypos;
#endif
#ifdef DYNAMIC_ARCH
if (memory_initialized == 1) {
LOCK_COMMAND(&alloc_lock);
if (memory_initialized == 1) {
if (!gotoblas) gotoblas_dynamic_init();
memory_initialized = 2;
}
UNLOCK_COMMAND(&alloc_lock);
}
#endif
#ifdef DEBUG
printf("Mapped : %p %3d\n\n",
(void *)memory[position].addr, position);
printf("Mapped : %p %3d\n\n", (void *)alloc_info, position);
#endif
return (void *)memory[position].addr;
alloc_info->used = 1;
return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
error:
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
return NULL;
}
void blas_memory_free(void *free_area){
void blas_memory_free(void *buffer){
#ifdef DEBUG
int position;
struct alloc_t ** alloc_table;
#endif
/* Since we passed an offset pointer to the caller, get back to the actual allocation */
struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
#ifdef DEBUG
printf("Unmapped Start : %p ...\n", free_area);
printf("Unmapped Start : %p ...\n", alloc_info);
#endif
position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++;
if (memory[position].addr != free_area) goto error;
#ifdef DEBUG
printf(" Position : %d\n", position);
#endif
// arm: ensure all writes are finished before other thread takes this memory
WMB;
memory[position].used = 0;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
alloc_info->used = 0;
#ifdef DEBUG
printf("Unmap Succeeded.\n\n");
@ -1262,15 +1245,13 @@ void blas_memory_free(void *free_area){
return;
error:
printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
#ifdef DEBUG
for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
alloc_table = get_memory_table();
for (position = 0; position < BUFFERS_PER_THREAD; position++){
if (alloc_table[position]) {
printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
}
}
#endif
return;
}
@ -1287,16 +1268,20 @@ void blas_memory_free_nolock(void * map_address) {
void blas_shutdown(void){
int pos;
int pos, thread;
#ifdef SMP
BLASFUNC(blas_thread_shutdown)();
#endif
LOCK_COMMAND(&alloc_lock);
for (pos = 0; pos < release_pos; pos ++) {
release_info[pos].func(&release_info[pos]);
for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){
for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
struct alloc_t *alloc_info = local_memory_table[thread][pos];
if (alloc_info) {
alloc_info->release_func(alloc_info);
alloc_info = (void *)0;
}
}
}
#ifdef SEEK_ADDRESS
@ -1305,17 +1290,6 @@ void blas_shutdown(void){
base_address = BASE_ADDRESS;
#endif
for (pos = 0; pos < NUM_BUFFERS; pos ++){
memory[pos].addr = (void *)0;
memory[pos].used = 0;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
memory[pos].pos = -1;
#endif
memory[pos].lock = 0;
}
UNLOCK_COMMAND(&alloc_lock);
return;
}
@ -1339,7 +1313,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
size_t size;
BLASULONG buffer;
size = BUFFER_SIZE - PAGESIZE;
size = allocation_block_size - PAGESIZE;
buffer = (BLASULONG)sa + GEMM_OFFSET_A;
#if defined(OS_LINUX) && !defined(NO_WARMUP)
@ -1360,7 +1334,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
UNLOCK_COMMAND(&init_lock);
#endif
size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
size = MIN((allocation_block_size - PAGESIZE), L2_SIZE);
buffer = (BLASULONG)sa + GEMM_OFFSET_A;
while (size > 0) {

View File

@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \
idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX)
CSBLAS1OBJS = \
cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
@ -277,7 +277,7 @@ CSBLAS3OBJS = \
cblas_sgeadd.$(SUFFIX)
CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
@ -294,7 +294,7 @@ CDBLAS3OBJS += \
cblas_dgeadd.$(SUFFIX)
CCBLAS1OBJS = \
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
cblas_ccopy.$(SUFFIX) \
cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
@ -320,7 +320,7 @@ CCBLAS3OBJS = \
CZBLAS1OBJS = \
cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
cblas_zcopy.$(SUFFIX) \
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c
cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)

View File

@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 8
#define SWITCH_RATIO 4
#define SWITCH_RATIO 32
#ifdef ARCH_X86
@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 8
#define SWITCH_RATIO 4
#define SWITCH_RATIO 32
#ifdef ARCH_X86

View File

@ -122,8 +122,13 @@ endif
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB =
ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
endif
endif
endif
sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME)
$(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)

View File

@ -25,7 +25,6 @@ endif ()
# known to hang with the native Windows and Android threads
# FIXME needs checking if this works on any of the other platforms
if (NOT NO_CBLAS)
if (NOT USE_OPENMP)
if (OS_CYGWIN_NT OR OS_LINUX)
set(OpenBLAS_utest_src
@ -34,7 +33,6 @@ set(OpenBLAS_utest_src
)
endif()
endif()
endif()
if (NOT NO_LAPACK)
set(OpenBLAS_utest_src

View File

@ -17,13 +17,11 @@ endif
#this does not work with OpenMP nor with native Windows or Android threads
# FIXME TBD if this works on OSX, SunOS, POWER and zarch
ifneq ($(NO_CBLAS), 1)
ifndef USE_OPENMP
ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
OBJS += test_fork.o
endif
endif
endif
all : run_test

View File

@ -13,9 +13,9 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ -48,11 +48,13 @@ void* xmalloc(size_t n)
}
}
void check_dgemm(double *a, double *b, double *result, double *expected, int n)
void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
{
char trans1 = 'T';
char trans2 = 'N';
double zerod = 0, oned = 1;
int i;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n,
1.0, a, n, b, n, 0.0, result, n);
BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n);
for(i = 0; i < n * n; ++i) {
ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS);
}
@ -60,7 +62,7 @@ void check_dgemm(double *a, double *b, double *result, double *expected, int n)
CTEST(fork, safety)
{
int n = 1000;
blasint n = 1000;
int i;
double *a, *b, *c, *d;
@ -84,8 +86,10 @@ CTEST(fork, safety)
// Compute a DGEMM product in the parent process prior to forking to
// ensure that the OpenBLAS thread pool is initialized.
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n,
1.0, a, n, b, n, 0.0, c, n);
char trans1 = 'T';
char trans2 = 'N';
double zerod = 0, oned = 1;
BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, c, &n);
fork_pid = fork();
if (fork_pid == -1) {