Merge pull request #1 from xianyi/develop

Merge xianyi:develop into develop

Commit c38c65eb65

Makefile (28 changed lines)
@@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1)
 RELA = re_lapack
 endif

+ifeq ($(NO_FORTRAN), 1)
+define NOFORTRAN
+1
+endef
+define NO_LAPACK
+1
+endef
+export NOFORTRAN
+export NO_LAPACK
+endif
+
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))

 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
@@ -47,7 +58,7 @@ endif
 endif

 @echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
 @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
 endif
 ifneq ($(OSNAME), AIX)

@@ -108,7 +119,7 @@ endif
 endif

 tests :
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
 touch $(LIBNAME)
 ifndef NO_FBLAS
 $(MAKE) -C test all

@@ -210,7 +221,7 @@ netlib :

 else
 netlib : lapack_prebuild
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
 @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
 @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
 endif

@@ -231,7 +242,10 @@ prof_lapack : lapack_prebuild
 @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof

 lapack_prebuild :
-ifndef NOFORTRAN
+$(info filter value of NOFORTRAN is:)
+$(info x$(filter-out $(NOFORTRAN), 1 2)x)
+
+ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
 -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
 -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc

@@ -274,21 +288,21 @@ endif
 endif

 large.tgz :
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
 if [ ! -a $< ]; then
 -wget http://www.netlib.org/lapack/timing/large.tgz;
 fi
 endif

 timing.tgz :
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
 if [ ! -a $< ]; then
 -wget http://www.netlib.org/lapack/timing/timing.tgz;
 fi
 endif

 lapack-timing : large.tgz timing.tgz
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
 (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
 (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
 $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
Makefile.rule

@@ -60,6 +60,14 @@ VERSION = 0.3.1.dev
 # This flag is always set for POWER8. Don't modify the flag
 # USE_OPENMP = 1

+# The OpenMP scheduler to use - by default this is "static" and you
+# will normally not want to change this unless you know that your main
+# workload will involve tasks that have highly unbalanced running times
+# for individual threads. Changing away from "static" may also adversely
+# affect memory access locality in NUMA systems. Setting to "runtime" will
+# allow you to select the scheduler from the environment variable OMP_SCHEDULE
+# CCOMMON_OPT += -DOMP_SCHED=dynamic
+
 # You can define maximum number of threads. Basically it should be
 # less than actual number of cores. If you don't specify one, it's
 # automatically detected by the the script.

@@ -156,8 +164,11 @@ NO_AFFINITY = 1
 # CONSISTENT_FPCSR = 1

 # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
-# with single thread. You can use this flag to avoid the overhead of multi-threading
-# in small matrix sizes. The default value is 4.
+# with single thread. (Actually in recent versions this is a factor proportional to the
+# number of floating point operations necessary for the given problem size, no longer
+# an individual dimension). You can use this setting to avoid the overhead of multi-
+# threading in small matrix sizes. The default value is 4, but values as high as 50 have
+# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
 # GEMM_MULTITHREAD_THRESHOLD = 4

 # If you need santy check by comparing reference BLAS. It'll be very
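The revised comment says the threshold is now compared against something proportional to the floating point work of the whole call, not against any single dimension. The following is only an illustrative sketch of that kind of gate, with made-up function and constant names; it is not OpenBLAS's actual interface code:

    #include <stdio.h>

    /* Hypothetical check: thread a GEMM call only when the work is large
       enough.  "Proportional to the number of floating point operations"
       suggests comparing m*n*k against threshold^3 instead of testing each
       dimension on its own. */
    static int gemm_worth_threading(int m, int n, int k, int threshold) {
        double flops  = (double)m * (double)n * (double)k;
        double cutoff = (double)threshold * threshold * threshold;
        return flops > cutoff;
    }

    int main(void) {
        /* With GEMM_MULTITHREAD_THRESHOLD = 50, a 64x64x64 product would be
           threaded while a 32x32x32 product would stay single-threaded. */
        printf("%d\n", gemm_worth_threading(64, 64, 64, 50));
        printf("%d\n", gemm_worth_threading(32, 32, 32, 50));
        return 0;
    }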
Makefile.system

@@ -248,7 +248,7 @@ endif

 ifeq ($(OSNAME), Darwin)
 ifndef MACOSX_DEPLOYMENT_TARGET
-export MACOSX_DEPLOYMENT_TARGET=10.6
+export MACOSX_DEPLOYMENT_TARGET=10.8
 endif
 MD5SUM = md5 -r
 endif
Makefile.x86_64

@@ -8,6 +8,13 @@ endif
 endif
 endif

+ifeq ($(CORE), SKYLAKEX)
+ifndef NO_AVX512
+CCOMMON_OPT += -march=skylake-avx512
+FCOMMON_OPT += -march=skylake-avx512
+endif
+endif
+
 ifeq ($(OSNAME), Interix)
 ARFLAGS = -m x64
 endif
cblas.h (5 changed lines)

@@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

+CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
 void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
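The new i?amin declarations mirror the existing i?amax ones and return the index of the element with the smallest absolute value. A minimal usage sketch, assuming a build that exports these CBLAS symbols and the usual zero-based CBLAS index convention:

    #include <stdio.h>
    #include <cblas.h>

    int main(void) {
        float x[] = {4.0f, -0.5f, 3.0f, -7.0f};
        /* Index of the minimum |x[i]| over 4 elements with stride 1. */
        CBLAS_INDEX imin = cblas_isamin(4, x, 1);
        printf("isamin = %d\n", (int)imin);  /* expected: 1, since |x[1]| = 0.5 */
        return 0;
    }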
common_stackalloc.h

@@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * - large enough to support all architectures and kernel
 * Chosing a too small SIZE will lead to a stack smashing.
 */
-#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
-/* make it volatile because some function (ex: dgemv_n.S) */ \
-/* do not restore all register */ \
-volatile int stack_alloc_size = SIZE; \
-if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
-stack_alloc_size = 0; \
-STACK_ALLOC_PROTECT_SET \
-TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
+#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
+/* make it volatile because some function (ex: dgemv_n.S) */ \
+/* do not restore all register */ \
+volatile int stack_alloc_size = SIZE; \
+if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
+STACK_ALLOC_PROTECT_SET \
+/* Avoid declaring an array of length 0 */ \
+TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
+__attribute__((aligned(0x20))); \
 BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
 #else
 //Original OpenBLAS/GotoBLAS codes.
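The reworked macro avoids declaring a zero-length variable-length array, which is not valid C, by always reserving at least one element and falling back to blas_memory_alloc when the request exceeds the stack cap. A standalone sketch of the same pattern, with plain malloc/free standing in for the OpenBLAS allocator (error handling omitted):

    #include <stdlib.h>
    #include <string.h>

    #define MAX_STACK_ELEMS 1024   /* illustrative cap, not OpenBLAS's MAX_STACK_ALLOC */

    /* Copy n doubles through a scratch buffer that lives on the stack when it
       is small and on the heap otherwise - the same shape as STACK_ALLOC. */
    static void copy_via_scratch(const double *src, double *dst, size_t n) {
        size_t stack_n = (n > MAX_STACK_ELEMS) ? 0 : n;
        double stack_buf[stack_n ? stack_n : 1];   /* never a zero-length VLA */
        double *scratch = stack_n ? stack_buf : malloc(n * sizeof(double));

        memcpy(scratch, src, n * sizeof(double));
        memcpy(dst, scratch, n * sizeof(double));

        if (!stack_n) free(scratch);
    }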
@@ -60,8 +60,13 @@
 #endif
 */

-#define MB
-#define WMB
+#ifdef __GNUC__
+#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
+#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
+#else
+#define MB do {} while (0)
+#define WMB do {} while (0)
+#endif

 static void __inline blas_lock(volatile BLASULONG *address){
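With this change MB and WMB are no longer empty: on GCC-compatible compilers they expand to a compiler barrier (an empty asm with a "memory" clobber), and the do { } while (0) wrapper keeps them well-behaved inside unbraced if/else bodies. A small sketch of why such a barrier matters when one thread publishes a buffer that another thread spins on, which is how the threaded GEMM code in this commit uses them:

    /* Sketch only; the real definitions and flags live in the OpenBLAS sources.
       The empty asm is a compiler barrier: it stops the compiler from reordering
       or caching memory accesses across it (it is not a CPU fence). */
    #define WMB do { __asm__ __volatile__("" : : : "memory"); } while (0)
    #define MB  do { __asm__ __volatile__("" : : : "memory"); } while (0)

    static double buffer[256];
    static volatile long ready = 0;

    void producer(void) {
        buffer[0] = 42.0;   /* fill the shared region first...           */
        WMB;                /* ...then let the flag store become visible */
        ready = 1;
    }

    double consumer(void) {
        while (ready == 0) { MB; }   /* force a fresh read on every spin */
        return buffer[0];
    }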
cpuid_x86.c (17 changed lines)

@@ -1339,6 +1339,23 @@ int get_cpuname(void){
 return CPUTYPE_NEHALEM;
 }
 break;
+case 6:
+switch (model) {
+case 6: // Cannon Lake
+#ifndef NO_AVX512
+return CPUTYPE_SKYLAKEX;
+#else
+if(support_avx())
+#ifndef NO_AVX2
+return CPUTYPE_HASWELL;
+#else
+return CPUTYPE_SANDYBRIDGE;
+#endif
+else
+return CPUTYPE_NEHALEM;
+#endif
+}
+break;
 case 9:
 case 8:
 switch (model) {
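The new case matches Cannon Lake, which reports CPUID family 6 with extended model 6 and model 6 (effective model 0x66). For reference, the family/model fields can be read with the CPUID instruction; this sketch uses GCC's <cpuid.h> helper rather than OpenBLAS's own cpuid routines:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void) {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;

        unsigned int family    = (eax >> 8)  & 0x0f;
        unsigned int model     = (eax >> 4)  & 0x0f;
        unsigned int ext_model = (eax >> 16) & 0x0f;

        /* On family 6 parts the effective model is (extended_model << 4) | model,
           so Cannon Lake shows up as extended model 6, model 6 -> 0x66. */
        if (family == 6) model |= ext_model << 4;

        printf("family %u, model 0x%x\n", family, model);
        return 0;
    }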
ctest/Makefile

@@ -102,7 +102,13 @@ clean ::
 rm -f x*

 FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
 CEXTRALIB =
+ifeq ($(USE_OPENMP), 1)
+ifeq ($(F_COMPILER), GFORTRAN)
+ifeq ($(C_COMPILER), CLANG)
+CEXTRALIB = -lomp
+endif
+endif
+endif

 # Single real
 xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)
|
|
driver/level3/level3_thread.c

@@ -91,11 +91,7 @@
 #endif

 typedef struct {
-#if __STDC_VERSION__ >= 201112L
-_Atomic
-#else
 volatile
-#endif
 BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
 } job_t;
|
@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
|
||||
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
|
||||
|
||||
/* Make sure if no one is using workspace */
|
||||
START_RPCC();
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
#if defined(FUSED_GEMM) && !defined(TIMING)
|
||||
|
||||
/* Fused operation to copy region of B into workspace and apply kernel */
|
||||
|
@ -391,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
}
|
||||
#endif
|
||||
|
||||
/* Set flag so other threads can access local region of B */
|
||||
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
|
||||
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
|
||||
/* Make sure if no one is using workspace */
|
||||
START_RPCC();
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
STOP_RPCC(waiting1);
|
||||
/* Set flag so other threads can access local region of B */
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
WMB;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
||||
/* Get regions of B from other threads and apply kernel */
|
||||
|
@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
/* Wait until other region of B is initialized */
|
||||
START_RPCC();
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
/* Apply kernel with local region of A and part of other region of B */
|
||||
|
@ -430,12 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
/* Clear synchronization flag if this thread is done with other region of B */
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
|
||||
/* Iterate through steps of m
|
||||
/* Iterate through steps of m
|
||||
* Note: First step has already been finished */
|
||||
for(is = m_from + min_i; is < m_to; is += min_i){
|
||||
min_i = m_to - is;
|
||||
|
@ -465,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
|
||||
c, ldc, is, js);
|
||||
STOP_RPCC(kernel);
|
||||
|
||||
|
||||
#ifdef TIMING
|
||||
ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
|
||||
#endif
|
||||
|
||||
|
||||
/* Clear synchronization flag if this thread is done with region of B */
|
||||
if (is + min_i >= m_to) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
START_RPCC();
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
for (js = 0; js < DIVIDE_RATE; js++) {
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
|
||||
}
|
||||
}
|
||||
STOP_RPCC(waiting3);
|
||||
|
@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
}
|
||||
|
||||
/* Clear synchronization flags */
|
||||
for (i = 0; i < MAX_CPU_NUMBER; i++) {
|
||||
for (j = 0; j < MAX_CPU_NUMBER; j++) {
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
for (j = 0; j < nthreads; j++) {
|
||||
for (k = 0; k < DIVIDE_RATE; k++) {
|
||||
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
|
||||
}
|
||||
|
|
|
driver/others/blas_server_omp.c

@@ -48,6 +48,10 @@

 #else

+#ifndef OMP_SCHED
+#define OMP_SCHED static
+#endif
+
 int blas_server_avail = 0;

 static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];

@@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
 break;
 }

-#pragma omp parallel for schedule(static)
+#pragma omp parallel for schedule(OMP_SCHED)
 for (i = 0; i < num; i ++) {

 #ifndef USE_SIMPLE_THREADED_LEVEL3
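The OpenMP server now takes its loop schedule from the OMP_SCHED macro: static by default, overridable at build time with for example -DOMP_SCHED=dynamic, or -DOMP_SCHED=runtime to defer to the OMP_SCHEDULE environment variable, exactly as the Makefile.rule comment above describes. A self-contained sketch of the same macro-driven schedule selection:

    #include <stdio.h>
    #include <omp.h>

    /* Mirror of the guard added in blas_server_omp.c: the schedule kind is a
       compile-time token unless the build overrides it. */
    #ifndef OMP_SCHED
    #define OMP_SCHED static
    #endif

    int main(void) {
        long sum = 0;
    #pragma omp parallel for schedule(OMP_SCHED) reduction(+:sum)
        for (int i = 0; i < 1000; i++) sum += i;
        /* Built with -DOMP_SCHED=runtime, the schedule is then chosen from the
           OMP_SCHEDULE environment variable when the program runs. */
        printf("%ld\n", sum);
        return 0;
    }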
driver/others/dynamic.c

@@ -338,6 +338,23 @@ static gotoblas_t *get_coretype(void){
 return &gotoblas_NEHALEM;
 }
 return NULL;
+case 6:
+if (model == 6) {
+// Cannon Lake
+#ifndef NO_AVX512
+return &gotoblas_SKYLAKEX;
+#else
+if(support_avx())
+#ifndef NO_AVX2
+return &gotoblas_HASWELL;
+#else
+return &gotblas_SANDYBRIDGE;
+#endif
+else
+return &gotoblas_NEHALEM;
+#endif
+}
+return NULL;
 case 9:
 case 8:
 if (model == 14 ) { // Kaby Lake
driver/others/memory.c

(The first hunk below and the @@ -213,7 hunk after it are whitespace-only changes; the difference is not visible once leading whitespace is stripped in this view. The same license cleanup appears again in utest/test_fork.c.)

@@ -13,9 +13,9 @@ met:
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written
 permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

@@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FIXED_PAGESIZE 4096
 #endif

+#ifndef BUFFERS_PER_THREAD
+#ifdef USE_OPENMP
+#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
+#else
+#define BUFFERS_PER_THREAD NUM_BUFFERS
+#endif
+#endif
+
 #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))

 #if defined(_MSC_VER) && !defined(__clang__)
@@ -213,7 +221,7 @@ int i,n;
 ret = sched_getaffinity(0,size,cpusetp);
 if (ret!=0) return nums;
 ret = CPU_COUNT_S(size,cpusetp);
-if (ret > 0 && ret < nums) nums = ret;
+if (ret > 0 && ret < nums) nums = ret;
 CPU_FREE(cpusetp);
 return nums;
 #endif
@@ -318,6 +326,8 @@ int goto_get_num_procs (void) {
 return blas_cpu_number;
 }

+static void blas_memory_init();
+
 void openblas_fork_handler()
 {
 // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is

@@ -329,7 +339,7 @@ void openblas_fork_handler()
 // implementation of OpenMP.
 #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
 int err;
-err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
+err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);
 if(err != 0)
 openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
 #endif
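Registering blas_memory_init as the third pthread_atfork handler means a forked child re-initializes the allocator's per-thread bookkeeping instead of inheriting the parent's state. A minimal sketch of the pthread_atfork pattern; the handler names below are illustrative only:

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static void prepare_in_parent(void) { /* e.g. quiesce worker threads before fork */ }
    static void reinit_in_child(void)   { puts("child: reset allocator state"); }

    int main(void) {
        /* The first handler runs in the parent before fork(); the third runs
           only in the child, immediately after the fork. */
        if (pthread_atfork(prepare_in_parent, NULL, reinit_in_child) != 0)
            fprintf(stderr, "could not install fork handlers\n");

        if (fork() == 0) {
            /* reinit_in_child has already run by the time we get here. */
            _exit(0);
        }
        return 0;
    }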
@@ -407,16 +417,104 @@ int openblas_get_num_threads(void) {
 #endif
 }

-struct release_t {
-void *address;
-void (*func)(struct release_t *);
-long attr;
-};

 int hugetlb_allocated = 0;

-static struct release_t release_info[NUM_BUFFERS];
-static int release_pos = 0;
+#if defined(OS_WINDOWS)
+#define THREAD_LOCAL __declspec(thread)
+#define LIKELY_ONE(x) (x)
+#else
+#define THREAD_LOCAL __thread
+#define LIKELY_ONE(x) (__builtin_expect(x, 1))
+#endif
+
+/* Stores information about the allocation and how to release it */
+struct alloc_t {
+/* Whether this allocation is being used */
+int used;
+/* Any special attributes needed when releasing this allocation */
+int attr;
+/* Function that can properly release this memory */
+void (*release_func)(struct alloc_t *);
+/* Pad to 64-byte alignment */
+char pad[64 - 2 * sizeof(int) - sizeof(void(*))];
+};
+
+/* Convenience macros for storing release funcs */
+#define STORE_RELEASE_FUNC(address, func) \
+if (address != (void *)-1) { \
+struct alloc_t *alloc_info = (struct alloc_t *)address; \
+alloc_info->release_func = func; \
+}
+
+#define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \
+if (address != (void *)-1) { \
+struct alloc_t *alloc_info = (struct alloc_t *)address; \
+alloc_info->release_func = func; \
+alloc_info->attr = attr; \
+}
+
+/* The number of bytes that will be allocated for each buffer. When allocating
+memory, we store an alloc_t followed by the actual buffer memory. This means
+that each allocation always has its associated alloc_t, without the need
+for an auxiliary tracking structure. */
+static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
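Each buffer is now allocated together with its own alloc_t header: the release function and attributes live immediately before the memory handed to callers, so the old release_info table is no longer needed. A simplified sketch of that header-before-payload layout, using plain malloc in place of the mmap/shm backends used in this file:

    #include <stdlib.h>

    struct alloc_hdr {
        int used;
        void (*release_func)(struct alloc_hdr *);
    };

    static void release_malloc(struct alloc_hdr *hdr) { free(hdr); }

    /* Allocate header + payload in one block and hand the payload back. */
    static void *buffer_alloc(size_t payload_bytes) {
        struct alloc_hdr *hdr = malloc(sizeof *hdr + payload_bytes);
        if (!hdr) return NULL;
        hdr->used = 1;
        hdr->release_func = release_malloc;
        return (char *)hdr + sizeof *hdr;
    }

    /* Mark a buffer reusable, stepping back from the payload to recover the
       header - the same offset arithmetic blas_memory_free now uses. */
    static void buffer_mark_free(void *payload) {
        struct alloc_hdr *hdr = (struct alloc_hdr *)((char *)payload - sizeof *hdr);
        hdr->used = 0;
    }

    /* Fully release a buffer, as blas_shutdown does through release_func. */
    static void buffer_release(void *payload) {
        struct alloc_hdr *hdr = (struct alloc_hdr *)((char *)payload - sizeof *hdr);
        hdr->release_func(hdr);
    }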
|
||||
/* Clang supports TLS from version 2.8 */
|
||||
#if defined(__clang__) && __clang_major__ > 2 || \
|
||||
(__clang_minor__ == 2 || __clang_minor__ == 8)
|
||||
#define HAS_COMPILER_TLS
|
||||
#endif
|
||||
|
||||
/* GCC supports TLS from version 4.1 */
|
||||
#if !defined(__clang__) && defined(__GNUC__) && \
|
||||
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
|
||||
#define HAS_COMPILER_TLS
|
||||
#endif
|
||||
|
||||
/* MSVC supports TLS from version 2005 */
|
||||
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
||||
#define HAS_COMPILER_TLS
|
||||
#endif
|
||||
|
||||
/* Versions of XCode before 8 did not properly support TLS */
|
||||
#if defined(__apple_build_version__) && __apple_build_version__ < 8000042
|
||||
#undef HAS_COMPILER_TLS
|
||||
#endif
|
||||
|
||||
/* Android NDK's before version 12b did not support TLS */
|
||||
#if defined(__ANDROID__) && defined(__clang__)
|
||||
#if __has_include(<android/ndk-version.h>)
|
||||
#include <android/ndk-version.h>
|
||||
#endif
|
||||
#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
|
||||
defined(__NDK_MINOR__) && \
|
||||
((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
|
||||
#undef HAS_COMPILER_TLS
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Holds pointers to allocated memory */
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
/* This is the number of threads than can be spawned by the server, which is the
|
||||
server plus the number of threads in the thread pool */
|
||||
# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER
|
||||
static int next_memory_table_pos = 0;
|
||||
# if defined(HAS_COMPILER_TLS)
|
||||
/* Use compiler generated thread-local-storage */
|
||||
static int THREAD_LOCAL local_memory_table_pos = 0;
|
||||
# else
|
||||
/* Use system-dependent thread-local-storage */
|
||||
# if defined(OS_WINDOWS)
|
||||
static DWORD local_storage_key;
|
||||
# else
|
||||
static pthread_key_t local_storage_key;
|
||||
# endif /* defined(OS_WINDOWS) */
|
||||
# endif /* defined(HAS_COMPILER_TLS) */
|
||||
#else
|
||||
/* There is only one allocating thread when in single-threaded mode and when using OpenMP */
|
||||
# define MAX_ALLOCATING_THREADS 1
|
||||
#endif /* defined(SMP) && !defined(USE_OPENMP) */
|
||||
static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD];
|
||||
|
||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
static int hot_alloc = 0;
|
||||
|
@ -432,11 +530,41 @@ static pthread_spinlock_t alloc_lock = 0;
|
|||
static BLASULONG alloc_lock = 0UL;
|
||||
#endif
|
||||
|
||||
/* Returns a pointer to the start of the per-thread memory allocation data */
|
||||
static __inline struct alloc_t ** get_memory_table() {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
# if !defined(HAS_COMPILER_TLS)
|
||||
# if defined(OS_WINDOWS)
|
||||
int local_memory_table_pos = (int)::TlsGetValue(local_storage_key);
|
||||
# else
|
||||
int local_memory_table_pos = (int)pthread_getspecific(local_storage_key);
|
||||
# endif /* defined(OS_WINDOWS) */
|
||||
# endif /* !defined(HAS_COMPILER_TLS) */
|
||||
if (!local_memory_table_pos) {
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
local_memory_table_pos = next_memory_table_pos++;
|
||||
if (next_memory_table_pos > MAX_ALLOCATING_THREADS)
|
||||
printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n");
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
# if !defined(HAS_COMPILER_TLS)
|
||||
# if defined(OS_WINDOWS)
|
||||
::TlsSetValue(local_storage_key, (void*)local_memory_table_pos);
|
||||
# else
|
||||
pthread_setspecific(local_storage_key, (void*)local_memory_table_pos);
|
||||
# endif /* defined(OS_WINDOWS) */
|
||||
# endif /* !defined(HAS_COMPILER_TLS) */
|
||||
}
|
||||
return local_memory_table[local_memory_table_pos];
|
||||
#else
|
||||
return local_memory_table[0];
|
||||
#endif /* defined(SMP) && !defined(USE_OPENMP) */
|
||||
}
|
||||
|
||||
#ifdef ALLOC_MMAP
|
||||
|
||||
static void alloc_mmap_free(struct release_t *release){
|
||||
static void alloc_mmap_free(struct alloc_t *alloc_info){
|
||||
|
||||
if (munmap(release -> address, BUFFER_SIZE)) {
|
||||
if (munmap(alloc_info, allocation_block_size)) {
|
||||
printf("OpenBLAS : munmap failed\n");
|
||||
}
|
||||
}
|
||||
|
@ -450,28 +578,18 @@ static void *alloc_mmap(void *address){
|
|||
|
||||
if (address){
|
||||
map_address = mmap(address,
|
||||
BUFFER_SIZE,
|
||||
allocation_block_size,
|
||||
MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
|
||||
} else {
|
||||
map_address = mmap(address,
|
||||
BUFFER_SIZE,
|
||||
allocation_block_size,
|
||||
MMAP_ACCESS, MMAP_POLICY, -1, 0);
|
||||
}
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
|
||||
|
||||
#ifdef OS_LINUX
|
||||
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
|
||||
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
|
||||
#endif
|
||||
|
||||
return map_address;
|
||||
|
@ -524,25 +642,25 @@ static void *alloc_mmap(void *address){
|
|||
|
||||
if (address){
|
||||
/* Just give up use advanced operation */
|
||||
map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
|
||||
map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
|
||||
|
||||
#ifdef OS_LINUX
|
||||
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
|
||||
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
|
||||
#endif
|
||||
|
||||
} else {
|
||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
if (hot_alloc == 0) {
|
||||
map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
|
||||
map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
|
||||
|
||||
#ifdef OS_LINUX
|
||||
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
|
||||
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
|
||||
#endif
|
||||
|
||||
} else {
|
||||
#endif
|
||||
|
||||
map_address = mmap(NULL, BUFFER_SIZE * SCALING,
|
||||
map_address = mmap(NULL, allocation_block_size * SCALING,
|
||||
MMAP_ACCESS, MMAP_POLICY, -1, 0);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
|
@ -550,7 +668,7 @@ static void *alloc_mmap(void *address){
|
|||
#ifdef OS_LINUX
|
||||
#ifdef DEBUG
|
||||
int ret=0;
|
||||
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
|
||||
ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
|
||||
if(ret==-1){
|
||||
int errsv=errno;
|
||||
perror("OpenBLAS alloc_mmap:");
|
||||
|
@ -558,7 +676,7 @@ static void *alloc_mmap(void *address){
|
|||
}
|
||||
|
||||
#else
|
||||
my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
|
||||
my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -566,7 +684,7 @@ static void *alloc_mmap(void *address){
|
|||
allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
|
||||
|
||||
start = (BLASULONG)map_address;
|
||||
current = (SCALING - 1) * BUFFER_SIZE;
|
||||
current = (SCALING - 1) * allocation_block_size;
|
||||
|
||||
while(current > 0) {
|
||||
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
|
||||
|
@ -581,7 +699,7 @@ static void *alloc_mmap(void *address){
|
|||
best = (BLASULONG)-1;
|
||||
best_address = map_address;
|
||||
|
||||
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
|
||||
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
|
||||
|
||||
current = run_bench(start, allocsize);
|
||||
|
||||
|
@ -597,7 +715,7 @@ static void *alloc_mmap(void *address){
|
|||
if ((BLASULONG)best_address > (BLASULONG)map_address)
|
||||
munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
|
||||
|
||||
munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
|
||||
munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);
|
||||
|
||||
map_address = best_address;
|
||||
|
||||
|
@ -610,17 +728,7 @@ static void *alloc_mmap(void *address){
|
|||
}
|
||||
#endif
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
|
||||
|
||||
return map_address;
|
||||
}
|
||||
|
@ -632,9 +740,9 @@ static void *alloc_mmap(void *address){
|
|||
|
||||
#ifdef ALLOC_MALLOC
|
||||
|
||||
static void alloc_malloc_free(struct release_t *release){
|
||||
static void alloc_malloc_free(struct alloc_t *alloc_info){
|
||||
|
||||
free(release -> address);
|
||||
free(alloc_info);
|
||||
|
||||
}
|
||||
|
||||
|
@ -642,15 +750,11 @@ static void *alloc_malloc(void *address){
|
|||
|
||||
void *map_address;
|
||||
|
||||
map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
|
||||
map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
|
||||
|
||||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_malloc_free;
|
||||
release_pos ++;
|
||||
}
|
||||
STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
|
||||
|
||||
return map_address;
|
||||
|
||||
|
@ -667,24 +771,20 @@ void *qfree (void *address);
|
|||
#define QCOMMS 0x2
|
||||
#define QFAST 0x4
|
||||
|
||||
static void alloc_qalloc_free(struct release_t *release){
|
||||
static void alloc_qalloc_free(struct alloc_t *alloc_info){
|
||||
|
||||
qfree(release -> address);
|
||||
qfree(alloc_info);
|
||||
|
||||
}
|
||||
|
||||
static void *alloc_qalloc(void *address){
|
||||
void *map_address;
|
||||
|
||||
map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
|
||||
map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
|
||||
|
||||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_qalloc_free;
|
||||
release_pos ++;
|
||||
}
|
||||
STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
|
||||
|
||||
return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
|
||||
}
|
||||
|
@ -693,9 +793,9 @@ static void *alloc_qalloc(void *address){
|
|||
|
||||
#ifdef ALLOC_WINDOWS
|
||||
|
||||
static void alloc_windows_free(struct release_t *release){
|
||||
static void alloc_windows_free(struct alloc_t *alloc_info){
|
||||
|
||||
VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
|
||||
VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
|
||||
|
||||
}
|
||||
|
||||
|
@ -703,17 +803,13 @@ static void *alloc_windows(void *address){
|
|||
void *map_address;
|
||||
|
||||
map_address = VirtualAlloc(address,
|
||||
BUFFER_SIZE,
|
||||
allocation_block_size,
|
||||
MEM_RESERVE | MEM_COMMIT,
|
||||
PAGE_READWRITE);
|
||||
|
||||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_windows_free;
|
||||
release_pos ++;
|
||||
}
|
||||
STORE_RELEASE_FUNC(map_address, alloc_windows_free);
|
||||
|
||||
return map_address;
|
||||
}
|
||||
|
@ -725,13 +821,14 @@ static void *alloc_windows(void *address){
|
|||
#define DEVICEDRIVER_NAME "/dev/mapper"
|
||||
#endif
|
||||
|
||||
static void alloc_devicedirver_free(struct release_t *release){
|
||||
static void alloc_devicedirver_free(struct alloc_t *alloc_info){
|
||||
|
||||
if (munmap(release -> address, BUFFER_SIZE)) {
|
||||
int attr = alloc_info -> attr;
|
||||
if (munmap(address, allocation_block_size)) {
|
||||
printf("OpenBLAS : Bugphysarea unmap failed.\n");
|
||||
}
|
||||
|
||||
if (close(release -> attr)) {
|
||||
if (close(attr)) {
|
||||
printf("OpenBLAS : Bugphysarea close failed.\n");
|
||||
}
|
||||
|
||||
|
@ -748,17 +845,12 @@ static void *alloc_devicedirver(void *address){
|
|||
|
||||
}
|
||||
|
||||
map_address = mmap(address, BUFFER_SIZE,
|
||||
map_address = mmap(address, allocation_block_size,
|
||||
PROT_READ | PROT_WRITE,
|
||||
MAP_FILE | MAP_SHARED,
|
||||
fd, 0);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].attr = fd;
|
||||
release_info[release_pos].func = alloc_devicedirver_free;
|
||||
release_pos ++;
|
||||
}
|
||||
STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);
|
||||
|
||||
return map_address;
|
||||
}
|
||||
|
@ -767,9 +859,9 @@ static void *alloc_devicedirver(void *address){
|
|||
|
||||
#ifdef ALLOC_SHM
|
||||
|
||||
static void alloc_shm_free(struct release_t *release){
|
||||
static void alloc_shm_free(struct alloc_t *alloc_info){
|
||||
|
||||
if (shmdt(release -> address)) {
|
||||
if (shmdt(alloc_info)) {
|
||||
printf("OpenBLAS : Shared memory unmap failed.\n");
|
||||
}
|
||||
}
|
||||
|
@ -778,22 +870,21 @@ static void *alloc_shm(void *address){
|
|||
void *map_address;
|
||||
int shmid;
|
||||
|
||||
shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
|
||||
shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);
|
||||
|
||||
map_address = (void *)shmat(shmid, address, 0);
|
||||
|
||||
if (map_address != (void *)-1){
|
||||
|
||||
#ifdef OS_LINUX
|
||||
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
|
||||
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
|
||||
#endif
|
||||
|
||||
shmctl(shmid, IPC_RMID, 0);
|
||||
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].attr = shmid;
|
||||
release_info[release_pos].func = alloc_shm_free;
|
||||
release_pos ++;
|
||||
struct alloc_t *alloc_info = (struct alloc_t *)map_address;
|
||||
alloc_info->release_func = alloc_shm_free;
|
||||
alloc_info->attr = shmid;
|
||||
}
|
||||
|
||||
return map_address;
|
||||
|
@ -801,23 +892,23 @@ static void *alloc_shm(void *address){
|
|||
|
||||
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
|
||||
|
||||
static void alloc_hugetlb_free(struct release_t *release){
|
||||
static void alloc_hugetlb_free(struct alloc_t *alloc_info){
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_AIX)
|
||||
if (shmdt(release -> address)) {
|
||||
if (shmdt(alloc_info)) {
|
||||
printf("OpenBLAS : Hugepage unmap failed.\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __sun__
|
||||
|
||||
munmap(release -> address, BUFFER_SIZE);
|
||||
munmap(alloc_info, allocation_block_size);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
|
||||
VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -830,7 +921,7 @@ static void *alloc_hugetlb(void *address){
|
|||
#if defined(OS_LINUX) || defined(OS_AIX)
|
||||
int shmid;
|
||||
|
||||
shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
|
||||
shmid = shmget(IPC_PRIVATE, allocation_block_size,
|
||||
#ifdef OS_LINUX
|
||||
SHM_HUGETLB |
|
||||
#endif
|
||||
|
@ -843,7 +934,7 @@ static void *alloc_hugetlb(void *address){
|
|||
map_address = (void *)shmat(shmid, address, SHM_RND);
|
||||
|
||||
#ifdef OS_LINUX
|
||||
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
|
||||
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
|
||||
#endif
|
||||
|
||||
if (map_address != (void *)-1){
|
||||
|
@ -860,7 +951,7 @@ static void *alloc_hugetlb(void *address){
|
|||
mha.mha_pagesize = HUGE_PAGESIZE;
|
||||
memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
|
||||
|
||||
map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
|
||||
map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
@ -872,7 +963,7 @@ static void *alloc_hugetlb(void *address){
|
|||
|
||||
tp.PrivilegeCount = 1;
|
||||
tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
|
||||
|
||||
|
||||
if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
|
||||
CloseHandle(hToken);
|
||||
return (void*)-1;
|
||||
|
@ -884,7 +975,7 @@ static void *alloc_hugetlb(void *address){
|
|||
}
|
||||
|
||||
map_address = (void *)VirtualAlloc(address,
|
||||
BUFFER_SIZE,
|
||||
allocation_block_size,
|
||||
MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
|
||||
PAGE_READWRITE);
|
||||
|
||||
|
@ -895,11 +986,7 @@ static void *alloc_hugetlb(void *address){
|
|||
|
||||
#endif
|
||||
|
||||
if (map_address != (void *)-1){
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_hugetlb_free;
|
||||
release_pos ++;
|
||||
}
|
||||
STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
|
||||
|
||||
return map_address;
|
||||
}
|
||||
|
@ -911,13 +998,14 @@ static void *alloc_hugetlb(void *address){
|
|||
|
||||
static int hugetlb_pid = 0;
|
||||
|
||||
static void alloc_hugetlbfile_free(struct release_t *release){
|
||||
static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){
|
||||
|
||||
if (munmap(release -> address, BUFFER_SIZE)) {
|
||||
int attr = alloc_info -> attr;
|
||||
if (munmap(alloc_info, allocation_block_size)) {
|
||||
printf("OpenBLAS : HugeTLBfs unmap failed.\n");
|
||||
}
|
||||
|
||||
if (close(release -> attr)) {
|
||||
if (close(attr)) {
|
||||
printf("OpenBLAS : HugeTLBfs close failed.\n");
|
||||
}
|
||||
}
|
||||
|
@ -938,17 +1026,12 @@ static void *alloc_hugetlbfile(void *address){
|
|||
|
||||
unlink(filename);
|
||||
|
||||
map_address = mmap(address, BUFFER_SIZE,
|
||||
map_address = mmap(address, allocation_block_size,
|
||||
PROT_READ | PROT_WRITE,
|
||||
MAP_SHARED,
|
||||
fd, 0);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].attr = fd;
|
||||
release_info[release_pos].func = alloc_hugetlbfile_free;
|
||||
release_pos ++;
|
||||
}
|
||||
STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
|
||||
|
||||
return map_address;
|
||||
}
|
||||
|
@ -961,35 +1044,35 @@ static BLASULONG base_address = 0UL;
|
|||
static BLASULONG base_address = BASE_ADDRESS;
|
||||
#endif
|
||||
|
||||
static volatile struct {
|
||||
BLASULONG lock;
|
||||
void *addr;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
int pos;
|
||||
#endif
|
||||
int used;
|
||||
#ifndef __64BIT__
|
||||
char dummy[48];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
static _Atomic int memory_initialized = 0;
|
||||
#else
|
||||
char dummy[40];
|
||||
static volatile int memory_initialized = 0;
|
||||
#endif
|
||||
|
||||
} memory[NUM_BUFFERS];
|
||||
|
||||
static int memory_initialized = 0;
|
||||
|
||||
/* Memory allocation routine */
|
||||
/* procpos ... indicates where it comes from */
|
||||
/* 0 : Level 3 functions */
|
||||
/* 1 : Level 2 functions */
|
||||
/* 2 : Thread */
|
||||
|
||||
static void blas_memory_init(){
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
next_memory_table_pos = 0;
|
||||
# if !defined(HAS_COMPILER_TLS)
|
||||
# if defined(OS_WINDOWS)
|
||||
local_storage_key = ::TlsAlloc();
|
||||
# else
|
||||
pthread_key_create(&local_storage_key, NULL);
|
||||
# endif /* defined(OS_WINDOWS) */
|
||||
# endif /* defined(HAS_COMPILER_TLS) */
|
||||
#endif /* defined(SMP) && !defined(USE_OPENMP) */
|
||||
memset(local_memory_table, 0, sizeof(local_memory_table));
|
||||
}
|
||||
|
||||
void *blas_memory_alloc(int procpos){
|
||||
|
||||
int position;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
int mypos;
|
||||
#endif
|
||||
|
||||
void *map_address;
|
||||
|
||||
|
@ -1019,103 +1102,54 @@ void *blas_memory_alloc(int procpos){
|
|||
NULL,
|
||||
};
|
||||
void *(**func)(void *address);
|
||||
struct alloc_t * alloc_info;
|
||||
struct alloc_t ** alloc_table;
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
if (!memory_initialized) {
|
||||
if (!LIKELY_ONE(memory_initialized)) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
/* Only allow a single thread to initialize memory system */
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (!memory_initialized) {
|
||||
#endif
|
||||
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (!memory_initialized) {
|
||||
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
for (position = 0; position < NUM_BUFFERS; position ++){
|
||||
memory[position].addr = (void *)0;
|
||||
memory[position].pos = -1;
|
||||
memory[position].used = 0;
|
||||
memory[position].lock = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
blas_memory_init();
|
||||
#ifdef DYNAMIC_ARCH
|
||||
gotoblas_dynamic_init();
|
||||
gotoblas_dynamic_init();
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
|
||||
gotoblas_affinity_init();
|
||||
gotoblas_affinity_init();
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
|
||||
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
||||
#ifndef DYNAMIC_ARCH
|
||||
blas_set_parameter();
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
memory_initialized = 1;
|
||||
memory_initialized = 1;
|
||||
|
||||
}
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#if defined(USE_OPENMP)
|
||||
}
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
}
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Alloc Start ...\n");
|
||||
#endif
|
||||
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
|
||||
mypos = WhereAmI();
|
||||
|
||||
position = mypos;
|
||||
while (position >= NUM_BUFFERS) position >>= 1;
|
||||
|
||||
do {
|
||||
if (!memory[position].used && (memory[position].pos == mypos)) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#else
|
||||
blas_lock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].used) goto allocation;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
position ++;
|
||||
|
||||
} while (position < NUM_BUFFERS);
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
position = 0;
|
||||
|
||||
alloc_table = get_memory_table();
|
||||
do {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#else
|
||||
if (!memory[position].used) {
|
||||
blas_lock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].used) goto allocation;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
|
||||
position ++;
|
||||
|
||||
} while (position < NUM_BUFFERS);
|
||||
} while (position < BUFFERS_PER_THREAD);
|
||||
|
||||
goto error;
|
||||
|
||||
|
@ -1125,14 +1159,8 @@ void *blas_memory_alloc(int procpos){
|
|||
printf(" Position -> %d\n", position);
|
||||
#endif
|
||||
|
||||
memory[position].used = 1;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
#endif
|
||||
|
||||
if (!memory[position].addr) {
|
||||
alloc_info = alloc_table[position];
|
||||
if (!alloc_info) {
|
||||
do {
|
||||
#ifdef DEBUG
|
||||
printf("Allocation Start : %lx\n", base_address);
|
||||
|
@ -1144,18 +1172,18 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
while ((func != NULL) && (map_address == (void *) -1)) {
|
||||
|
||||
map_address = (*func)((void *)base_address);
|
||||
map_address = (*func)((void *)base_address);
|
||||
|
||||
#ifdef ALLOC_DEVICEDRIVER
|
||||
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
|
||||
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
|
||||
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ALLOC_HUGETLBFILE
|
||||
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
|
||||
#ifndef OS_WINDOWS
|
||||
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
|
||||
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
@ -1172,89 +1200,44 @@ void *blas_memory_alloc(int procpos){
|
|||
#endif
|
||||
if (((BLASLONG) map_address) == -1) base_address = 0UL;
|
||||
|
||||
if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
|
||||
if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE;
|
||||
|
||||
} while ((BLASLONG)map_address == -1);
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
memory[position].addr = map_address;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
alloc_table[position] = alloc_info = map_address;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
|
||||
printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
|
||||
if (memory[position].pos == -1) memory[position].pos = mypos;
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef DYNAMIC_ARCH
|
||||
|
||||
if (memory_initialized == 1) {
|
||||
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (memory_initialized == 1) {
|
||||
|
||||
if (!gotoblas) gotoblas_dynamic_init();
|
||||
|
||||
memory_initialized = 2;
|
||||
}
|
||||
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Mapped : %p %3d\n\n",
|
||||
(void *)memory[position].addr, position);
|
||||
printf("Mapped : %p %3d\n\n", (void *)alloc_info, position);
|
||||
#endif
|
||||
|
||||
return (void *)memory[position].addr;
|
||||
alloc_info->used = 1;
|
||||
|
||||
return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
|
||||
|
||||
error:
|
||||
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
|
||||
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void blas_memory_free(void *free_area){
|
||||
|
||||
void blas_memory_free(void *buffer){
|
||||
#ifdef DEBUG
|
||||
int position;
|
||||
struct alloc_t ** alloc_table;
|
||||
#endif
|
||||
/* Since we passed an offset pointer to the caller, get back to the actual allocation */
|
||||
struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Unmapped Start : %p ...\n", free_area);
|
||||
printf("Unmapped Start : %p ...\n", alloc_info);
|
||||
#endif
|
||||
|
||||
position = 0;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
||||
position++;
|
||||
|
||||
if (memory[position].addr != free_area) goto error;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf(" Position : %d\n", position);
|
||||
#endif
|
||||
|
||||
// arm: ensure all writes are finished before other thread takes this memory
|
||||
WMB;
|
||||
|
||||
memory[position].used = 0;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
alloc_info->used = 0;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Unmap Succeeded.\n\n");
|
||||
|
@ -1262,15 +1245,13 @@ void blas_memory_free(void *free_area){
|
|||
|
||||
return;
|
||||
|
||||
error:
|
||||
printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
|
||||
|
||||
#ifdef DEBUG
|
||||
for (position = 0; position < NUM_BUFFERS; position++)
|
||||
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
|
||||
#endif
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
alloc_table = get_memory_table();
|
||||
for (position = 0; position < BUFFERS_PER_THREAD; position++){
|
||||
if (alloc_table[position]) {
|
||||
printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
@ -1287,16 +1268,20 @@ void blas_memory_free_nolock(void * map_address) {
|
|||
|
||||
void blas_shutdown(void){
|
||||
|
||||
int pos;
|
||||
int pos, thread;
|
||||
|
||||
#ifdef SMP
|
||||
BLASFUNC(blas_thread_shutdown)();
|
||||
#endif
|
||||
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
for (pos = 0; pos < release_pos; pos ++) {
|
||||
release_info[pos].func(&release_info[pos]);
|
||||
for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){
|
||||
for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
|
||||
struct alloc_t *alloc_info = local_memory_table[thread][pos];
|
||||
if (alloc_info) {
|
||||
alloc_info->release_func(alloc_info);
|
||||
alloc_info = (void *)0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SEEK_ADDRESS
|
||||
|
@ -1305,17 +1290,6 @@ void blas_shutdown(void){
|
|||
base_address = BASE_ADDRESS;
|
||||
#endif
|
||||
|
||||
for (pos = 0; pos < NUM_BUFFERS; pos ++){
|
||||
memory[pos].addr = (void *)0;
|
||||
memory[pos].used = 0;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
memory[pos].pos = -1;
|
||||
#endif
|
||||
memory[pos].lock = 0;
|
||||
}
|
||||
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1339,7 +1313,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
|
|||
size_t size;
|
||||
BLASULONG buffer;
|
||||
|
||||
size = BUFFER_SIZE - PAGESIZE;
|
||||
size = allocation_block_size - PAGESIZE;
|
||||
buffer = (BLASULONG)sa + GEMM_OFFSET_A;
|
||||
|
||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
|
@ -1360,7 +1334,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
|
|||
UNLOCK_COMMAND(&init_lock);
|
||||
#endif
|
||||
|
||||
size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
|
||||
size = MIN((allocation_block_size - PAGESIZE), L2_SIZE);
|
||||
buffer = (BLASULONG)sa + GEMM_OFFSET_A;
|
||||
|
||||
while (size > 0) {
|
||||
|
|
|
@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \
|
|||
idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX)
|
||||
|
||||
CSBLAS1OBJS = \
|
||||
cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
|
||||
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
|
||||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
||||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
|
||||
|
@ -277,7 +277,7 @@ CSBLAS3OBJS = \
|
|||
cblas_sgeadd.$(SUFFIX)
|
||||
|
||||
CDBLAS1OBJS = \
|
||||
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
||||
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
||||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
||||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
|
||||
|
@ -294,7 +294,7 @@ CDBLAS3OBJS += \
|
|||
cblas_dgeadd.$(SUFFIX)
|
||||
|
||||
CCBLAS1OBJS = \
|
||||
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
|
||||
cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
|
||||
cblas_ccopy.$(SUFFIX) \
|
||||
cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \
|
||||
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
|
||||
|
@ -320,7 +320,7 @@ CCBLAS3OBJS = \
|
|||
|
||||
|
||||
CZBLAS1OBJS = \
|
||||
cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
|
||||
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
|
||||
cblas_zcopy.$(SUFFIX) \
|
||||
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
|
||||
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
|
||||
|
interface/Makefile

@@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c
 cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c
 $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)

+cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c
+$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c
+$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c
+$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c
+$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
 cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c
 $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
||||
|
|
param.h (4 changed lines)

@@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define SYMV_P 8

-#define SWITCH_RATIO 4
+#define SWITCH_RATIO 32

 #ifdef ARCH_X86

@@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define SYMV_P 8

-#define SWITCH_RATIO 4
+#define SWITCH_RATIO 32

 #ifdef ARCH_X86
test/Makefile

@@ -122,8 +122,13 @@ endif

 FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
 CEXTRALIB =

+ifeq ($(USE_OPENMP), 1)
+ifeq ($(F_COMPILER), GFORTRAN)
+ifeq ($(C_COMPILER), CLANG)
+CEXTRALIB = -lomp
+endif
+endif
+endif

 sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME)
 $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
utest/CMakeLists.txt

@@ -25,7 +25,6 @@ endif ()

 # known to hang with the native Windows and Android threads
 # FIXME needs checking if this works on any of the other platforms
-if (NOT NO_CBLAS)
 if (NOT USE_OPENMP)
 if (OS_CYGWIN_NT OR OS_LINUX)
 set(OpenBLAS_utest_src

@@ -34,7 +33,6 @@ set(OpenBLAS_utest_src
 )
 endif()
 endif()
-endif()

 if (NOT NO_LAPACK)
 set(OpenBLAS_utest_src
|
|
utest/Makefile

@@ -17,13 +17,11 @@ endif

 #this does not work with OpenMP nor with native Windows or Android threads
 # FIXME TBD if this works on OSX, SunOS, POWER and zarch
-ifneq ($(NO_CBLAS), 1)
 ifndef USE_OPENMP
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
 OBJS += test_fork.o
 endif
 endif
-endif

 all : run_test
|
|
@ -13,9 +13,9 @@ met:
|
|||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
|
@ -48,11 +48,13 @@ void* xmalloc(size_t n)
|
|||
}
|
||||
}
|
||||
|
||||
void check_dgemm(double *a, double *b, double *result, double *expected, int n)
|
||||
void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
|
||||
{
|
||||
char trans1 = 'T';
|
||||
char trans2 = 'N';
|
||||
double zerod = 0, oned = 1;
|
||||
int i;
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n,
|
||||
1.0, a, n, b, n, 0.0, result, n);
|
||||
BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n);
|
||||
for(i = 0; i < n * n; ++i) {
|
||||
ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS);
|
||||
}
|
||||
|
@ -60,7 +62,7 @@ void check_dgemm(double *a, double *b, double *result, double *expected, int n)
|
|||
|
||||
CTEST(fork, safety)
|
||||
{
|
||||
int n = 1000;
|
||||
blasint n = 1000;
|
||||
int i;
|
||||
|
||||
double *a, *b, *c, *d;
|
||||
|
@ -84,8 +86,10 @@ CTEST(fork, safety)
|
|||
|
||||
// Compute a DGEMM product in the parent process prior to forking to
|
||||
// ensure that the OpenBLAS thread pool is initialized.
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n,
|
||||
1.0, a, n, b, n, 0.0, c, n);
|
||||
char trans1 = 'T';
|
||||
char trans2 = 'N';
|
||||
double zerod = 0, oned = 1;
|
||||
BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, c, &n);
|
||||
|
||||
fork_pid = fork();
|
||||
if (fork_pid == -1) {
|
||||
|
|