Files
OpenBLAS/driver/others/memory.c
Craig Donner bf40f806ef Remove the need for most locking in memory.c.
Using thread local storage for tracking memory allocations means that threads
no longer have to lock at all when doing memory allocations / frees. This
particularly helps the gemm driver since it does an allocation per invocation.
Even without threading at all, this helps, since even calling a lock with
no contention has a cost:

Before this change, no threading:
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4          102 ns        102 ns   13504412
BM_SGEMM/6          175 ns        175 ns    7997580
BM_SGEMM/8          205 ns        205 ns    6842073
BM_SGEMM/10         266 ns        266 ns    5294919
BM_SGEMM/16         478 ns        478 ns    2963441
BM_SGEMM/20         690 ns        690 ns    2144755
BM_SGEMM/32        1906 ns       1906 ns     716981
BM_SGEMM/40        2983 ns       2983 ns     473218
BM_SGEMM/64        9421 ns       9422 ns     148450
BM_SGEMM/72       12630 ns      12631 ns     112105
BM_SGEMM/80       15845 ns      15846 ns      89118
BM_SGEMM/90       25675 ns      25676 ns      54332
BM_SGEMM/100      29864 ns      29865 ns      47120
BM_SGEMM/112      37841 ns      37842 ns      36717
BM_SGEMM/128      56531 ns      56532 ns      25361
BM_SGEMM/140      75886 ns      75888 ns      18143
BM_SGEMM/150      98493 ns      98496 ns      14299
BM_SGEMM/160     102620 ns     102622 ns      13381
BM_SGEMM/170     135169 ns     135173 ns      10231
BM_SGEMM/180     146170 ns     146172 ns       9535
BM_SGEMM/189     190226 ns     190231 ns       7397
BM_SGEMM/200     194513 ns     194519 ns       7210
BM_SGEMM/256     396561 ns     396573 ns       3531
```
with this change:
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4           95 ns         95 ns   14500387
BM_SGEMM/6          166 ns        166 ns    8381763
BM_SGEMM/8          196 ns        196 ns    7277044
BM_SGEMM/10         256 ns        256 ns    5515721
BM_SGEMM/16         463 ns        463 ns    3025197
BM_SGEMM/20         636 ns        636 ns    2070213
BM_SGEMM/32        1885 ns       1885 ns     739444
BM_SGEMM/40        2969 ns       2969 ns     472152
BM_SGEMM/64        9371 ns       9372 ns     148932
BM_SGEMM/72       12431 ns      12431 ns     112919
BM_SGEMM/80       15615 ns      15616 ns      89978
BM_SGEMM/90       25397 ns      25398 ns      55041
BM_SGEMM/100      29445 ns      29446 ns      47540
BM_SGEMM/112      37530 ns      37531 ns      37286
BM_SGEMM/128      55373 ns      55375 ns      25277
BM_SGEMM/140      76241 ns      76241 ns      18259
BM_SGEMM/150     102196 ns     102200 ns      13736
BM_SGEMM/160     101521 ns     101525 ns      13556
BM_SGEMM/170     136182 ns     136184 ns      10567
BM_SGEMM/180     146861 ns     146864 ns       9035
BM_SGEMM/189     192632 ns     192632 ns       7231
BM_SGEMM/200     198547 ns     198555 ns       6995
BM_SGEMM/256     392316 ns     392330 ns       3539
```

Before, when built with USE_THREAD=1 and GEMM_MULTITHREAD_THRESHOLD=4, the cost
of small matrix operations was dominated by thread locking (see the sizes smaller
than 32), even when no threads were explicitly spawned:
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4          328 ns        328 ns    4170562
BM_SGEMM/6          396 ns        396 ns    3536400
BM_SGEMM/8          418 ns        418 ns    3330102
BM_SGEMM/10         491 ns        491 ns    2863047
BM_SGEMM/16         710 ns        710 ns    2028314
BM_SGEMM/20         871 ns        871 ns    1581546
BM_SGEMM/32        2132 ns       2132 ns     657089
BM_SGEMM/40        3197 ns       3196 ns     437969
BM_SGEMM/64        9645 ns       9645 ns     144987
BM_SGEMM/72       35064 ns      32881 ns      50264
BM_SGEMM/80       37661 ns      35787 ns      42080
BM_SGEMM/90       36507 ns      36077 ns      40091
BM_SGEMM/100      32513 ns      31850 ns      48607
BM_SGEMM/112      41742 ns      41207 ns      37273
BM_SGEMM/128      67211 ns      65095 ns      21933
BM_SGEMM/140      68263 ns      67943 ns      19245
BM_SGEMM/150     121854 ns     115439 ns      10660
BM_SGEMM/160     116826 ns     115539 ns      10000
BM_SGEMM/170     126566 ns     122798 ns      11960
BM_SGEMM/180     130088 ns     127292 ns      11503
BM_SGEMM/189     120309 ns     116634 ns      13162
BM_SGEMM/200     114559 ns     110993 ns      10000
BM_SGEMM/256     217063 ns     207806 ns       6417
```
and after, it's gone (note this includes my other change which reduces calls
to num_cpu_avail):
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4           95 ns         95 ns   12347650
BM_SGEMM/6          166 ns        166 ns    8259683
BM_SGEMM/8          193 ns        193 ns    7162210
BM_SGEMM/10         258 ns        258 ns    5415657
BM_SGEMM/16         471 ns        471 ns    2981009
BM_SGEMM/20         666 ns        666 ns    2148002
BM_SGEMM/32        1903 ns       1903 ns     738245
BM_SGEMM/40        2969 ns       2969 ns     473239
BM_SGEMM/64        9440 ns       9440 ns     148442
BM_SGEMM/72       37239 ns      33330 ns      46813
BM_SGEMM/80       57350 ns      55949 ns      32251
BM_SGEMM/90       36275 ns      36249 ns      42259
BM_SGEMM/100      31111 ns      31008 ns      45270
BM_SGEMM/112      43782 ns      40912 ns      34749
BM_SGEMM/128      67375 ns      64406 ns      22443
BM_SGEMM/140      76389 ns      67003 ns      21430
BM_SGEMM/150      72952 ns      71830 ns      19793
BM_SGEMM/160      97039 ns      96858 ns      11498
BM_SGEMM/170     123272 ns     122007 ns      11855
BM_SGEMM/180     126828 ns     126505 ns      11567
BM_SGEMM/189     115179 ns     114665 ns      11044
BM_SGEMM/200      89289 ns      87259 ns      16147
BM_SGEMM/256     226252 ns     222677 ns       7375
```

I've also tested this with ThreadSanitizer and found no data races during
execution.  I'm not sure why 200 is always faster than its neighbors; we must
be hitting some optimal cache size or something.
2018-06-14 16:54:58 +01:00

1489 lines
34 KiB
C

/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
//#undef DEBUG
#include "common.h"
#include <errno.h>
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
#define ALLOC_WINDOWS
#ifndef MEM_LARGE_PAGES
#define MEM_LARGE_PAGES 0x20000000
#endif
#else
#define ALLOC_MMAP
#define ALLOC_MALLOC
#endif
#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
#include <sys/mman.h>
#ifndef NO_SYSV_IPC
#include <sys/shm.h>
#endif
#include <sys/ipc.h>
#endif
#include <sys/types.h>
#ifdef OS_LINUX
#include <sys/sysinfo.h>
#include <sched.h>
#include <errno.h>
#include <linux/unistd.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/resource.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
#include <conio.h>
#undef printf
#define printf _cprintf
#endif
#ifdef OS_LINUX
#ifndef MPOL_PREFERRED
#define MPOL_PREFERRED 1
#endif
#endif
#if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
#define NO_WARMUP
#endif
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
#ifndef FIXED_PAGESIZE
#define FIXED_PAGESIZE 4096
#endif
#ifndef BUFFERS_PER_THREAD
#ifdef USE_OPENMP
#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#else
#define BUFFERS_PER_THREAD NUM_BUFFERS
#endif
#endif
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#if defined(_MSC_VER) && !defined(__clang__)
#define CONSTRUCTOR __cdecl
#define DESTRUCTOR __cdecl
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
#define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101)))
#else
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#endif
#ifdef DYNAMIC_ARCH
gotoblas_t *gotoblas = NULL;
#endif
extern void openblas_warning(int verbose, const char * msg);
#ifndef SMP
#define blas_cpu_number 1
#define blas_num_threads 1
/* Dummy Function */
/* Build without SMP: a single processor is always reported. */
int goto_get_num_procs (void)
{
  return 1;
}
/* Build without SMP: the requested thread count is accepted and ignored. */
void goto_set_num_threads(int num_threads)
{
  (void)num_threads;
}
#else
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
#ifndef NO_AFFINITY
int get_num_procs(void);
#else
/*
 * Number of processors OpenBLAS may use (NO_AFFINITY build).
 *
 * The count starts as sysconf(_SC_NPROCESSORS_CONF) and is cached in a
 * function-local static. On Linux with a sufficiently recent glibc it is
 * then narrowed to the number of CPUs present in the calling process's
 * affinity mask.
 *
 * Returns the current count. If the affinity mask cannot be queried (non-Linux
 * system, old or unknown glibc, allocation failure, or sched_getaffinity
 * failing) the count is returned without being narrowed.
 */
int get_num_procs(void) {
  static int nums = 0;
  cpu_set_t *cpusetp;
  size_t size;
  int ret;
  int i,n;

  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
  return nums;
#endif

#if !defined(__GLIBC_PREREQ)
  return nums;
#else
#if !__GLIBC_PREREQ(2, 3)
  return nums;
#endif

#if !__GLIBC_PREREQ(2, 7)
  {
    /* Old glibc has no CPU_ALLOC: use a fixed-size set on the stack so that
       sched_getaffinity() always receives valid storage. */
    cpu_set_t cpuset;
    cpusetp = &cpuset;
    ret = sched_getaffinity(0, sizeof(cpu_set_t), cpusetp);
    if (ret != 0) return nums;
    n = 0;
#if !__GLIBC_PREREQ(2, 6)
    /* CPU_COUNT does not exist before glibc 2.6: count the bits by hand. */
    for (i = 0; i < nums; i++)
      if (CPU_ISSET(i, cpusetp)) n++;
    nums = n;
#else
    /* CPU_COUNT takes only the set; the size is implied by cpu_set_t. */
    nums = CPU_COUNT(cpusetp);
#endif
    return nums;
  }
#else
  cpusetp = CPU_ALLOC(nums);
  if (cpusetp == NULL) return nums;
  size = CPU_ALLOC_SIZE(nums);
  ret = sched_getaffinity(0, size, cpusetp);
  if (ret != 0) {
    /* Release the dynamically sized set on the failure path as well. */
    CPU_FREE(cpusetp);
    return nums;
  }
  ret = CPU_COUNT_S(size, cpusetp);
  /* Only ever shrink the count, and never to zero. */
  if (ret > 0 && ret < nums) nums = ret;
  CPU_FREE(cpusetp);
  return nums;
#endif
#endif
}
#endif
#endif
#ifdef OS_ANDROID
/* Android: configured processor count from sysconf(), queried once and cached. */
int get_num_procs(void) {
  /* 0 means the count has not been queried yet. */
  static int cached_count = 0;

  if (cached_count == 0) {
    cached_count = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached_count;
}
#endif
#ifdef OS_WINDOWS
/* Windows: processor count from GetSystemInfo(), queried once and cached. */
int get_num_procs(void) {
  static int cached_count = 0;
  SYSTEM_INFO info;

  /* Fast path: a previous call already stored a non-zero count. */
  if (cached_count != 0) return cached_count;

  GetSystemInfo(&info);
  cached_count = info.dwNumberOfProcessors;
  return cached_count;
}
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
/* BSD family: processor count from the hw.ncpu sysctl, cached while non-zero. */
int get_num_procs(void) {
  static int cached_count = 0;
  int mib[2];
  size_t out_len;

  if (cached_count != 0) return cached_count;

  mib[0] = CTL_HW;
  mib[1] = HW_NCPU;
  out_len = sizeof(int);
  /* The sysctl result is not checked; on failure the count is left at 0. */
  sysctl(mib, 2, &cached_count, &out_len, NULL, 0);
  return cached_count;
}
#endif
#if defined(OS_DARWIN)
/* Darwin: count from the "hw.physicalcpu" sysctl, cached while non-zero. */
int get_num_procs(void) {
  static int cached_count = 0;
  size_t out_len;

  if (cached_count != 0) return cached_count;

  out_len = sizeof(int);
  /* The sysctl result is not checked; on failure the count is left at 0. */
  sysctlbyname("hw.physicalcpu", &cached_count, &out_len, NULL, 0);
  return cached_count;
}
/*
void set_stack_limit(int limitMB){
int result=0;
struct rlimit rl;
rlim_t StackSize;
StackSize=limitMB*1024*1024;
result=getrlimit(RLIMIT_STACK, &rl);
if(result==0){
if(rl.rlim_cur < StackSize){
rl.rlim_cur=StackSize;
result=setrlimit(RLIMIT_STACK, &rl);
if(result !=0){
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
}
}
}
}
*/
#endif
/*
Number of CPU cores OpenBLAS uses for multithreading.
It can be changed with openblas_set_num_threads(int num_threads).
*/
int blas_cpu_number = 0;
/*
Number of threads in the thread pool.
This value is greater than or equal to blas_cpu_number; when it is greater, some threads are sleeping.
*/
int blas_num_threads = 0;
/* SMP build: report the value currently stored in blas_cpu_number. */
int goto_get_num_procs (void)
{
  const int configured = blas_cpu_number;

  return configured;
}
/*
 * Register blas_thread_shutdown as the pthread_atfork "prepare" handler.
 * Compiled to a no-op on native Windows, on Android, and when SMP_SERVER is
 * not defined.
 */
void openblas_fork_handler()
{
  // The handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
  // built with "make USE_OPENMP=0".
  // A hang is still possible when OpenBLAS is built against the libgomp
  // implementation of OpenMP; that problem is tracked at:
  //       http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
  // Until it is resolved, build with USE_OPENMP=0 or link against another
  // implementation of OpenMP.
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
  if (pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL) != 0) {
    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  }
#endif
}
extern int openblas_num_threads_env();
extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
/*
 * Decide how many threads OpenBLAS uses and record the answer in both
 * blas_num_threads and blas_cpu_number.
 *
 * A previously decided non-zero blas_num_threads is returned as is.
 * Otherwise the first positive value among openblas_num_threads_env(),
 * openblas_goto_num_threads_env() (both skipped under USE_OPENMP) and
 * openblas_omp_num_threads_env() is taken, defaulting to MAX_CPU_NUMBER.
 * The result is capped by get_num_procs() on the listed operating systems
 * and by MAX_CPU_NUMBER.
 */
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  int hw_limit;
#endif
  int env_threads = 0;
  int omp_threads = 0;
  int threads;

  /* Already decided on an earlier call. */
  if (blas_num_threads) return blas_num_threads;

#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  hw_limit = get_num_procs();
#endif

#ifndef USE_OPENMP
  /* OpenBLAS-specific setting first, then the GotoBLAS one as a fallback. */
  env_threads = openblas_num_threads_env();
  if (env_threads < 0) env_threads = 0;
  if (env_threads == 0) {
    env_threads = openblas_goto_num_threads_env();
    if (env_threads < 0) env_threads = 0;
  }
#endif

  omp_threads = openblas_omp_num_threads_env();
  if (omp_threads < 0) omp_threads = 0;

  if (env_threads > 0) {
    threads = env_threads;
  } else if (omp_threads > 0) {
    threads = omp_threads;
  } else {
    threads = MAX_CPU_NUMBER;
  }

#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  if (threads > hw_limit) threads = hw_limit;
#endif
  if (threads > MAX_CPU_NUMBER) threads = MAX_CPU_NUMBER;

  blas_num_threads = threads;
#ifdef DEBUG
  printf( "Adjusted number of threads : %3d\n", blas_num_threads);
#endif
  blas_cpu_number = blas_num_threads;
  return blas_num_threads;
}
#endif
/* Public query: get_num_procs() in SMP builds, otherwise always 1. */
int openblas_get_num_procs(void) {
#ifdef SMP
  return get_num_procs();
#else
  return 1;
#endif
}
/* Public query: blas_cpu_number in SMP builds, otherwise always 1. */
int openblas_get_num_threads(void) {
#ifdef SMP
  /* Make sure blas_cpu_number has been initialised before reading it. */
  blas_get_cpu_number();
  return blas_cpu_number;
#else
  return 1;
#endif
}
/* One record per buffer obtained from a system allocator, so that
   blas_shutdown() can give the buffer back through the matching free routine. */
struct release_t {
/* Address stored by the alloc_* routine that created the buffer. */
void *address;
/* Free routine for this buffer; it is called with a pointer to this record. */
void (*func)(struct release_t *);
/* Allocator-specific extra value: the alloc_* routines that set it store a
   file descriptor or a shared-memory id here. */
long attr;
};
int hugetlb_allocated = 0;
#if defined(OS_WINDOWS)
#define THREAD_LOCAL __declspec(thread)
#define UNLIKELY_TO_BE_ZERO(x) (x)
#else
#define THREAD_LOCAL __thread
#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0))
#endif
static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD];
static int THREAD_LOCAL release_pos = 0;
#if defined(OS_LINUX) && !defined(NO_WARMUP)
static int hot_alloc = 0;
#endif
/* Global lock for memory allocation */
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t alloc_lock = 0;
#else
static BLASULONG alloc_lock = 0UL;
#endif
#ifdef ALLOC_MMAP
/* Unmap a BUFFER_SIZE region created by alloc_mmap(); report failure on stdout. */
static void alloc_mmap_free(struct release_t *release){
  int rc = munmap(release -> address, BUFFER_SIZE);

  if (rc != 0) {
    printf("OpenBLAS : munmap failed\n");
  }
}
#ifdef NO_WARMUP
static void *alloc_mmap(void *address){
void *map_address;
if (address){
map_address = mmap(address,
BUFFER_SIZE,
MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
} else {
map_address = mmap(address,
BUFFER_SIZE,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
}
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
}
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif
return map_address;
}
#else
#define BENCH_ITERATION 4
#define SCALING 2
/*
 * Time a pointer chase through the region [address, address + size).
 *
 * The first word of each page is followed as a pointer to the next location
 * (the caller, alloc_mmap, stores those links). For the duration of the
 * measurement the word at address + size - PAGESIZE is redirected back to
 * `address`, and is restored afterwards.
 *
 * address: start of the region to probe, as an integer.
 * size:    length of the region in bytes; size / PAGESIZE hops are made per run.
 * Returns the smallest rpcc() difference seen over BENCH_ITERATION runs.
 */
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
BLASULONG original, *p;
BLASULONG start, stop, min;
int iter, i, count;
/* Start from the largest representable value so any measurement replaces it. */
min = (BLASULONG)-1;
/* Save the link stored in the last page and point it back to the start. */
original = *(BLASULONG *)(address + size - PAGESIZE);
*(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
p = (BLASULONG *)address;
count = size / PAGESIZE;
start = rpcc();
/* Each load depends on the previous one: follow the stored links. */
for (i = 0; i < count; i ++) {
p = (BLASULONG *)(*p);
}
stop = rpcc();
if (min > stop - start) min = stop - start;
}
/* Put the saved link back. */
*(BLASULONG *)(address + size - PAGESIZE + 0) = original;
/* NOTE(review): storing the final pointer presumably keeps the compiler from
   discarding the chase loop as dead code -- confirm. */
*(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
return min;
}
/*
 * Map one BUFFER_SIZE region (variant compiled when NO_WARMUP is not defined).
 *
 * address: placement request; when non-NULL the region is mapped with
 *          MAP_FIXED at that address and no placement search is done.
 * With a NULL address and hot_alloc == 0 a plain BUFFER_SIZE mapping is made.
 * Otherwise BUFFER_SIZE * SCALING bytes are mapped, every page-aligned start
 * within the search window is timed with run_bench(), the fastest start is
 * kept and the surplus on both sides is unmapped.
 *
 * Returns the chosen address, or (void *)-1 when mmap fails. A successful
 * mapping is recorded in release_info for later release.
 */
static void *alloc_mmap(void *address){
void *map_address, *best_address;
BLASULONG best, start, current;
BLASULONG allocsize;
if (address){
/* A fixed address was requested: skip the placement search. */
map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif
} else {
/* The conditional below opens an if/else whose closing brace sits inside
   the matching conditional near the end of the function. */
#if defined(OS_LINUX) && !defined(NO_WARMUP)
if (hot_alloc == 0) {
/* hot_alloc is still 0: take an ordinary mapping without benchmarking. */
map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif
} else {
#endif
/* Over-allocate by SCALING so there is room to pick a start address. */
map_address = mmap(NULL, BUFFER_SIZE * SCALING,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
if (map_address != (void *)-1) {
#ifdef OS_LINUX
#ifdef DEBUG
int ret=0;
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
if(ret==-1){
int errsv=errno;
perror("OpenBLAS alloc_mmap:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
}
#else
my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
#endif
#endif
/* Size of the window that each benchmark run walks through. */
allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
/* Store in the first word of every page the address of the following page,
   over the first (SCALING - 1) * BUFFER_SIZE bytes of the mapping. */
start = (BLASULONG)map_address;
current = (SCALING - 1) * BUFFER_SIZE;
while(current > 0) {
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
start += PAGESIZE;
current -= PAGESIZE;
}
/* The last linked page points back to the beginning of the mapping. */
*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
/* Try every page-aligned start and remember the one with the lowest time. */
start = (BLASULONG)map_address;
best = (BLASULONG)-1;
best_address = map_address;
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
current = run_bench(start, allocsize);
if (best > current) {
best = current;
best_address = (void *)start;
}
start += PAGESIZE;
}
/* Give back whatever lies before the chosen start ... */
if ((BLASULONG)best_address > (BLASULONG)map_address)
munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
/* ... and whatever lies beyond best_address + BUFFER_SIZE. */
munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
map_address = best_address;
#if defined(OS_LINUX) && !defined(NO_WARMUP)
/* _touch_memory() skips its page-touching work once hot_alloc is 2. */
hot_alloc = 2;
#endif
}
}
#if defined(OS_LINUX) && !defined(NO_WARMUP)
}
#endif
/* Remember a successful mapping so blas_shutdown() can unmap it. */
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
}
return map_address;
}
#endif
#endif
#ifdef ALLOC_MALLOC
/* Return a buffer obtained by alloc_malloc() to the heap. */
static void alloc_malloc_free(struct release_t *release){
  void *heap_block = release -> address;

  free(heap_block);
}
static void *alloc_malloc(void *address){
void *map_address;
map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
if (map_address == (void *)NULL) map_address = (void *)-1;
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_malloc_free;
release_pos ++;
}
return map_address;
}
#endif
#ifdef ALLOC_QALLOC
void *qalloc(int flags, size_t bytes);
void *qfree (void *address);
#define QNONCACHE 0x1
#define QCOMMS 0x2
#define QFAST 0x4
/* Give a buffer obtained by alloc_qalloc() back through qfree(). */
static void alloc_qalloc_free(struct release_t *release){
  void *raw_block = release -> address;

  qfree(raw_block);
}
/*
 * qalloc-based allocator: requests BUFFER_SIZE + FIXED_PAGESIZE bytes with
 * QCOMMS | QFAST and hands out the first FIXED_PAGESIZE-aligned address
 * inside that block.
 *
 * address: not used by this allocator.
 * Returns the aligned address, or (void *)-1 when qalloc fails. The raw
 * (unaligned) pointer is what gets recorded in release_info, because that is
 * the value qfree() must receive.
 */
static void *alloc_qalloc(void *address){
  void *map_address;

  map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);

  /* Report failure before any alignment arithmetic: rounding the -1 marker
     up to a page boundary wraps around to 0, which callers would take for a
     successful allocation. */
  if (map_address == (void *)NULL || map_address == (void *)-1) return (void *)-1;

  release_info[release_pos].address = map_address;
  release_info[release_pos].func    = alloc_qalloc_free;
  release_pos ++;

  return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(BLASULONG)(FIXED_PAGESIZE - 1));
}
#endif
#ifdef ALLOC_WINDOWS
static void alloc_windows_free(struct release_t *release){
VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
}
static void *alloc_windows(void *address){
void *map_address;
map_address = VirtualAlloc(address,
BUFFER_SIZE,
MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
if (map_address == (void *)NULL) map_address = (void *)-1;
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_windows_free;
release_pos ++;
}
return map_address;
}
#endif
#ifdef ALLOC_DEVICEDRIVER
#ifndef DEVICEDRIVER_NAME
#define DEVICEDRIVER_NAME "/dev/mapper"
#endif
/* Undo alloc_devicedirver(): unmap the region, then close the descriptor
   kept in release->attr. Each failure is reported on stdout. */
static void alloc_devicedirver_free(struct release_t *release){
  int unmap_rc = munmap(release -> address, BUFFER_SIZE);

  if (unmap_rc != 0) {
    printf("OpenBLAS : Bugphysarea unmap failed.\n");
  }

  if (close(release -> attr) != 0) {
    printf("OpenBLAS : Bugphysarea close failed.\n");
  }
}
/*
 * Device-driver allocator: map BUFFER_SIZE bytes of DEVICEDRIVER_NAME.
 *
 * address: passed to mmap as the requested start address.
 * Returns the mapped address, or (void *)-1 when the device cannot be opened
 * or mapped. On success the mapping and its descriptor are recorded in
 * release_info; on failure the descriptor is closed again.
 */
static void *alloc_devicedirver(void *address){
  int fd;
  void *map_address;

  if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
    return (void *)-1;
  }

  map_address = mmap(address, BUFFER_SIZE,
                     PROT_READ | PROT_WRITE,
                     MAP_FILE | MAP_SHARED,
                     fd, 0);

  if (map_address == (void *)-1) {
    /* Nothing will ever release this descriptor, so close it here. */
    close(fd);
    return map_address;
  }

  release_info[release_pos].address = map_address;
  release_info[release_pos].attr    = fd;
  release_info[release_pos].func    = alloc_devicedirver_free;
  release_pos ++;

  return map_address;
}
#endif
#ifdef ALLOC_SHM
/* Detach a shared-memory segment attached by alloc_shm(); report failure on stdout. */
static void alloc_shm_free(struct release_t *release){
  int rc = shmdt(release -> address);

  if (rc != 0) {
    printf("OpenBLAS : Shared memory unmap failed.\n");
  }
}
/*
 * System V shared-memory allocator: create a private BUFFER_SIZE segment and
 * attach it.
 *
 * address: passed to shmat as the requested attach address.
 * Returns the attached address, or (void *)-1 when the segment cannot be
 * created or attached. A successful attachment is recorded in release_info.
 */
static void *alloc_shm(void *address){
  void *map_address;
  int shmid;

  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, IPC_CREAT | 0600);
  if (shmid == -1) return (void *)-1;

  map_address = (void *)shmat(shmid, address, 0);

  /* Mark the segment for removal whether or not the attach worked: a segment
     that is never marked outlives the process. */
  shmctl(shmid, IPC_RMID, 0);

  if (map_address != (void *)-1){
#ifdef OS_LINUX
    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif

    release_info[release_pos].address = map_address;
    release_info[release_pos].attr    = shmid;
    release_info[release_pos].func    = alloc_shm_free;
    release_pos ++;
  }

  return map_address;
}
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
/*
 * Release a buffer obtained by alloc_hugetlb(), using the counterpart of
 * whatever that function used on the current platform:
 *   Linux / AIX : shmat    -> shmdt
 *   Solaris     : memalign -> free
 *   Windows     : VirtualAlloc -> VirtualFree(MEM_RELEASE)
 */
static void alloc_hugetlb_free(struct release_t *release){

#if defined(OS_LINUX) || defined(OS_AIX)
  if (shmdt(release -> address)) {
    printf("OpenBLAS : Hugepage unmap failed.\n");
  }
#endif

#ifdef __sun__
  /* The buffer comes from memalign(), i.e. from the heap, so it is returned
     with free() rather than unmapped. */
  free(release -> address);
#endif

#ifdef OS_WINDOWS
  /* MEM_RELEASE takes a size of 0 and frees the whole reservation.
     MEM_LARGE_PAGES is an allocation flag, not a free type. */
  VirtualFree(release -> address, 0, MEM_RELEASE);
#endif

}
/*
 * Large-page allocator.
 *
 *   Linux / AIX : System V segment created with the platform's large-page
 *                 flags and attached with shmat.
 *   Solaris     : memcntl() large-page advice followed by memalign().
 *   Windows     : VirtualAlloc with MEM_LARGE_PAGES, with the lock-memory
 *                 privilege enabled only for the duration of the call.
 *
 * address: requested placement (used by shmat and VirtualAlloc).
 * Returns the buffer address, or (void *)-1 on any failure. A successful
 * allocation is recorded in release_info for later release.
 */
static void *alloc_hugetlb(void *address){

  void *map_address = (void *)-1;

#if defined(OS_LINUX) || defined(OS_AIX)
  int shmid;

  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
#ifdef OS_LINUX
                 SHM_HUGETLB |
#endif
#ifdef OS_AIX
                 SHM_LGPAGE | SHM_PIN |
#endif
                 IPC_CREAT | SHM_R | SHM_W);

  if (shmid != -1) {
    map_address = (void *)shmat(shmid, address, SHM_RND);

    /* Mark the segment for removal whether or not the attach worked, so a
       failed attach does not leave a large-page segment behind. */
    shmctl(shmid, IPC_RMID, 0);

#ifdef OS_LINUX
    /* Only set a memory policy on an address that was really attached. */
    if (map_address != (void *)-1) {
      my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
    }
#endif
  }
#endif

#ifdef __sun__
  struct memcntl_mha mha;

  mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  mha.mha_flags = 0;
  mha.mha_pagesize = HUGE_PAGESIZE;
  memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);

  map_address = (void *)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
  /* Callers use (void *)-1, not NULL, as the failure marker. */
  if (map_address == (void *)NULL) map_address = (void *)-1;
#endif

#ifdef OS_WINDOWS
  HANDLE hToken;
  TOKEN_PRIVILEGES tp;

  if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;

  tp.PrivilegeCount = 1;
  tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;

  if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
    CloseHandle(hToken);
    return (void*)-1;
  }

  if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
    CloseHandle(hToken);
    return (void*)-1;
  }

  map_address = (void *)VirtualAlloc(address,
                                     BUFFER_SIZE,
                                     MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
                                     PAGE_READWRITE);

  /* Drop the privilege again and release the token handle on this path too. */
  tp.Privileges[0].Attributes = 0;
  AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
  CloseHandle(hToken);

  if (map_address == (void *)NULL) map_address = (void *)-1;
#endif

  if (map_address != (void *)-1){
    release_info[release_pos].address = map_address;
    release_info[release_pos].func    = alloc_hugetlb_free;
    release_pos ++;
  }

  return map_address;
}
#endif
#endif
#ifdef ALLOC_HUGETLBFILE
static int hugetlb_pid = 0;
/* Undo alloc_hugetlbfile(): unmap the region, then close the descriptor kept
   in release->attr. Each failure is reported on stdout. */
static void alloc_hugetlbfile_free(struct release_t *release){
  int unmap_rc = munmap(release -> address, BUFFER_SIZE);

  if (unmap_rc != 0) {
    printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  }

  if (close(release -> attr) != 0) {
    printf("OpenBLAS : HugeTLBfs close failed.\n");
  }
}
/*
 * hugetlbfs allocator: create "<HUGETLB_FILE_NAME>/gotoblas.<pid>", unlink
 * it immediately and map BUFFER_SIZE bytes of it.
 *
 * address: passed to mmap as the requested start address.
 * Returns the mapped address, or (void *)-1 when the file name does not fit,
 * or the file cannot be opened or mapped. On success the mapping and its
 * descriptor are recorded in release_info; on failure the descriptor is
 * closed again.
 */
static void *alloc_hugetlbfile(void *address){

  void *map_address = (void *)-1;
  int fd;
  int name_len;
  char filename[64];

  if (!hugetlb_pid) hugetlb_pid = getpid();

  /* HUGETLB_FILE_NAME is configured at build time and may be long: refuse a
     name that would not fit instead of writing past the buffer. */
  name_len = snprintf(filename, sizeof(filename), "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
  if (name_len < 0 || (size_t)name_len >= sizeof(filename)) {
    return (void *)-1;
  }

  if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
    return (void *)-1;
  }

  /* The name is only needed to create the file; the descriptor keeps it alive. */
  unlink(filename);

  map_address = mmap(address, BUFFER_SIZE,
                     PROT_READ | PROT_WRITE,
                     MAP_SHARED,
                     fd, 0);

  if (map_address == (void *)-1) {
    /* Nothing will ever release this descriptor, so close it here. */
    close(fd);
    return map_address;
  }

  release_info[release_pos].address = map_address;
  release_info[release_pos].attr    = fd;
  release_info[release_pos].func    = alloc_hugetlbfile_free;
  release_pos ++;

  return map_address;
}
#endif
#ifdef SEEK_ADDRESS
static BLASULONG base_address = 0UL;
#else
static BLASULONG base_address = BASE_ADDRESS;
#endif
/* Bookkeeping for one buffer slot handed out by blas_memory_alloc(). */
struct memory_t {
/* Buffer address; blas_memory_alloc() allocates a buffer for the slot when
   this is still null. */
void *addr;
/* Non-zero while the buffer is checked out: set by blas_memory_alloc(),
   cleared by blas_memory_free(). */
int used;
/* NOTE(review): presumably padding meant to give each slot its own cache
   line -- confirm; the visible fields plus this array do not obviously add
   up to 64 bytes. */
#ifndef __64BIT__
char dummy[48];
#else
char dummy[40];
#endif
};
static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD];
static int memory_initialized = 0;
/* Memory allocation routine */
/* procpos ... indicates where it comes from */
/* 0 : Level 3 functions */
/* 1 : Level 2 functions */
/* 2 : Thread */
void *blas_memory_alloc(int procpos){
int position;
void *map_address;
void *(*memoryalloc[])(void *address) = {
#ifdef ALLOC_DEVICEDRIVER
alloc_devicedirver,
#endif
/* Hugetlb implicitly assumes ALLOC_SHM */
#ifdef ALLOC_SHM
alloc_shm,
#endif
#if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
alloc_hugetlb,
#endif
#ifdef ALLOC_MMAP
alloc_mmap,
#endif
#ifdef ALLOC_QALLOC
alloc_qalloc,
#endif
#ifdef ALLOC_WINDOWS
alloc_windows,
#endif
#ifdef ALLOC_MALLOC
alloc_malloc,
#endif
NULL,
};
void *(**func)(void *address);
if (UNLIKELY_TO_BE_ZERO(memory_initialized)) {
/* Only allow a single thread to initialize memory system */
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
#ifdef DYNAMIC_ARCH
gotoblas_dynamic_init();
#endif
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
gotoblas_affinity_init();
#endif
#ifdef SMP
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
#ifndef DYNAMIC_ARCH
blas_set_parameter();
#endif
#endif
memory_initialized = 1;
}
UNLOCK_COMMAND(&alloc_lock);
}
#ifdef DEBUG
printf("Alloc Start ...\n");
#endif
position = 0;
do {
if (!memory[position].used) goto allocation;
position ++;
} while (position < BUFFERS_PER_THREAD);
goto error;
allocation :
#ifdef DEBUG
printf(" Position -> %d\n", position);
#endif
memory[position].used = 1;
if (!memory[position].addr) {
do {
#ifdef DEBUG
printf("Allocation Start : %lx\n", base_address);
#endif
map_address = (void *)-1;
func = &memoryalloc[0];
while ((func != NULL) && (map_address == (void *) -1)) {
map_address = (*func)((void *)base_address);
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
}
#endif
#ifdef ALLOC_HUGETLBFILE
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
#endif
}
#endif
#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
#endif
func ++;
}
#ifdef DEBUG
printf(" Success -> %08lx\n", map_address);
#endif
if (((BLASLONG) map_address) == -1) base_address = 0UL;
if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
} while ((BLASLONG)map_address == -1);
memory[position].addr = map_address;
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
#endif
}
#ifdef DEBUG
printf("Mapped : %p %3d\n\n",
(void *)memory[position].addr, position);
#endif
return (void *)memory[position].addr;
error:
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
return NULL;
}
/*
 * Give a buffer obtained from blas_memory_alloc() back to the calling
 * thread's slot table.
 *
 * free_area: address previously returned by blas_memory_alloc().
 *
 * The slot is only marked as not in use; the buffer itself stays allocated
 * for reuse. If the address is not found in this thread's table a message is
 * printed and nothing else happens.
 */
void blas_memory_free(void *free_area){

  int position;

#ifdef DEBUG
  printf("Unmapped Start : %p ...\n", free_area);
#endif

  position = 0;
  while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area))
    position++;

  /* Decide from the index alone: when nothing matched, position equals
     BUFFERS_PER_THREAD and memory[position] would be past the end. */
  if (position >= BUFFERS_PER_THREAD) {
    printf("BLAS : Bad memory unallocation! : %4d  %p\n", position, free_area);
#ifdef DEBUG
    for (position = 0; position < BUFFERS_PER_THREAD; position++)
      printf("%4d  %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
    return;
  }

#ifdef DEBUG
  printf("  Position : %d\n", position);
#endif

  memory[position].used = 0;

#ifdef DEBUG
  printf("Unmap Succeeded.\n\n");
#endif
}
/*
 * Allocate BUFFER_SIZE + FIXED_PAGESIZE bytes directly with malloc().
 * No lock is taken and the `memory` table is not consulted.
 *
 *   unused : ignored.
 *
 * Returns the pointer produced by malloc() as-is.
 */
void *blas_memory_alloc_nolock(int unused) {
  (void)unused;
  return malloc(BUFFER_SIZE + FIXED_PAGESIZE);
}
/*
 * Release map_address with free().  No lock is taken and the `memory`
 * table is not touched.
 */
void blas_memory_free_nolock(void *map_address)
{
  free(map_address);
}
/*
 * Tear down allocator state.
 *
 * In SMP builds the worker threads are shut down first.  Then every
 * recorded release callback is invoked with its own record, base_address
 * is reset (0 when SEEK_ADDRESS is defined, BASE_ADDRESS otherwise) and
 * every slot of the `memory` table is cleared.
 */
void blas_shutdown(void){

  int idx;

#ifdef SMP
  BLASFUNC(blas_thread_shutdown)();
#endif

  /* Run the release callbacks in the order they were recorded. */
  idx = 0;
  while (idx < release_pos) {
    release_info[idx].func(&release_info[idx]);
    idx++;
  }

#ifdef SEEK_ADDRESS
  base_address = 0UL;
#else
  base_address = BASE_ADDRESS;
#endif

  /* Forget every buffer address and mark all slots free. */
  idx = 0;
  while (idx < BUFFERS_PER_THREAD) {
    memory[idx].addr = (void *)0;
    memory[idx].used = 0;
    idx++;
  }
}
#if defined(OS_LINUX) && !defined(NO_WARMUP)
/* Lock held by _touch_memory() (through LOCK_COMMAND / UNLOCK_COMMAND)
   around its PAGESIZE-stride write loop.  Only present in SMP builds; its
   type depends on which locking option (USE_PTHREAD_LOCK,
   USE_PTHREAD_SPINLOCK, or neither) is defined. */
#ifdef SMP
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t init_lock = 0;
#else
static BLASULONG init_lock = 0UL;
#endif
#endif
/*
 * Write into the buffer at sa so that its memory gets touched.
 *
 * Two passes are made, both starting at sa + GEMM_OFFSET_A:
 *   1. one int store every PAGESIZE bytes across BUFFER_SIZE - PAGESIZE
 *      bytes (under init_lock in SMP builds);
 *   2. one int store every 64 bytes across
 *      MIN(BUFFER_SIZE - PAGESIZE, L2_SIZE) bytes.
 * The value stored is the remaining byte count at that step.
 *
 * Both passes are skipped when hot_alloc == 2.  On ARCH_POWER and
 * ARCH_SPARC the body is compiled out entirely.
 *
 * Only sa is read; arg, range_m, range_n, sb and pos are unused here
 * (the signature matches what is stored in a queue entry's `routine`).
 */
static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
                          void *sa, void *sb, BLASLONG pos) {

#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)

  size_t remaining = BUFFER_SIZE - PAGESIZE;
  BLASULONG cursor = (BLASULONG)sa + GEMM_OFFSET_A;

#if defined(OS_LINUX) && !defined(NO_WARMUP)
  if (hot_alloc != 2) {
#endif

#ifdef SMP
    LOCK_COMMAND(&init_lock);
#endif

    /* Pass 1: one store per page. */
    for (; remaining > 0; remaining -= PAGESIZE) {
      *(int *)cursor = remaining;
      cursor += PAGESIZE;
    }

#ifdef SMP
    UNLOCK_COMMAND(&init_lock);
#endif

    /* Pass 2: restart at the beginning with a 64-byte stride. */
    remaining = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
    cursor = (BLASULONG)sa + GEMM_OFFSET_A;

    for (; remaining > 0; remaining -= 64) {
      *(int *)cursor = remaining;
      cursor += 64;
    }

#if defined(OS_LINUX) && !defined(NO_WARMUP)
  }
#endif

#endif
}
#ifdef SMP
static void _init_thread_memory(void *buffer) {
blas_queue_t queue[MAX_CPU_NUMBER];
int num_cpu;
for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
blas_queue_init(&queue[num_cpu]);
queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
queue[num_cpu].routine = &_touch_memory;
queue[num_cpu].args = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
}
queue[num_cpu - 1].next = NULL;
queue[0].sa = buffer;
exec_blas(num_cpu, queue);
}
#endif
/*
 * Allocate one buffer, have it touched, and give it back.
 *
 * Sets hot_alloc to 1, obtains a buffer with blas_memory_alloc(0), and
 * then:
 *   - SMP builds: make sure the CPU count is known (and, with SMP_SERVER,
 *     that the thread server is running), then run _touch_memory through
 *     the thread queue via _init_thread_memory();
 *   - otherwise: call _touch_memory() directly.
 * Finally the buffer is released with blas_memory_free().
 *
 * If the allocation yields NULL the touch and the free are skipped; the
 * thread setup above is still performed.
 */
static void gotoblas_memory_init(void) {

  void *buffer;

  hot_alloc = 1;

  buffer = (void *)blas_memory_alloc(0);

#ifdef SMP
  if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER
  if (blas_server_avail == 0) blas_thread_init();
#endif
#endif

  /* Guard against a NULL result: writing through NULL + offset or passing
     NULL to blas_memory_free() would be an error, and this step is only a
     best-effort warm-up. */
  if (buffer == NULL) return;

#ifdef SMP
  _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
#else
  _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
#endif

  blas_memory_free(buffer);
}
#endif
/* Initialization for all functions; this function should be called before main(). */
/* Non-zero once gotoblas_init() has run to completion; reset to 0 by
   gotoblas_quit().  Both functions test it to return early. */
static int gotoblas_initialized = 0;
/* Declared here and defined elsewhere; called from gotoblas_init(). */
extern void openblas_read_env();
/*
 * Library-wide initialization.
 *
 * Does nothing if gotoblas_initialized is already set.  Otherwise runs, in
 * this order and subject to the build options shown: the fork handler
 * (SMP), environment reading, dynamic-arch setup (DYNAMIC_ARCH), affinity
 * setup (SMP + OS_LINUX without NO_AFFINITY), the memory warm-up
 * (OS_LINUX without NO_WARMUP), CPU-count / thread-server setup (SMP,
 * SMP_SERVER) and function profiling setup (FUNCTION_PROFILE); then sets
 * gotoblas_initialized.  With PROFILE, moncontrol(0) is called near the
 * start and moncontrol(1) at the end.
 */
void CONSTRUCTOR gotoblas_init(void) {

  if (gotoblas_initialized != 0) {
    return;
  }

#ifdef SMP
  openblas_fork_handler();
#endif

  openblas_read_env();

#ifdef PROFILE
  moncontrol(0);
#endif

#ifdef DYNAMIC_ARCH
  gotoblas_dynamic_init();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  gotoblas_affinity_init();
#endif

#if defined(OS_LINUX) && !defined(NO_WARMUP)
  gotoblas_memory_init();
#endif

  /* Disabled (formerly guarded by OS_LINUX): raise the soft stack limit
     to the hard limit. */
//#if defined(OS_LINUX)
#if 0
  struct rlimit curlimit;
  if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
  {
    if ( curlimit.rlim_cur != curlimit.rlim_max )
    {
      curlimit.rlim_cur = curlimit.rlim_max;
      setrlimit(RLIMIT_STACK, &curlimit);
    }
  }
#endif

#ifdef SMP
  if (!blas_cpu_number) {
    blas_get_cpu_number();
  }
#ifdef SMP_SERVER
  if (!blas_server_avail) {
    blas_thread_init();
  }
#endif
#endif

#ifdef FUNCTION_PROFILE
  gotoblas_profile_init();
#endif

  gotoblas_initialized = 1;

#ifdef PROFILE
  moncontrol(1);
#endif
}
/*
 * Library-wide teardown.
 *
 * Does nothing unless gotoblas_initialized is set.  Otherwise calls
 * blas_shutdown(), then the function-profile, affinity and dynamic-arch
 * quit hooks that the build enables, and finally clears
 * gotoblas_initialized.  With PROFILE, moncontrol(0) is called right after
 * blas_shutdown() and moncontrol(1) at the end.
 */
void DESTRUCTOR gotoblas_quit(void) {

  if (!gotoblas_initialized) {
    return;
  }

  blas_shutdown();

#ifdef PROFILE
  moncontrol(0);
#endif

#ifdef FUNCTION_PROFILE
  gotoblas_profile_quit();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  gotoblas_affinity_quit();
#endif

#ifdef DYNAMIC_ARCH
  gotoblas_dynamic_quit();
#endif

  gotoblas_initialized = 0;

#ifdef PROFILE
  moncontrol(1);
#endif
}
#if defined(_MSC_VER) && !defined(__clang__)
/*
 * DLL entry point for MSVC builds.
 *
 * DLL_PROCESS_ATTACH runs gotoblas_init(); DLL_PROCESS_DETACH runs
 * gotoblas_quit().  Thread attach/detach and any other reason code are
 * ignored.  hModule and lpReserved are not used.  Always returns TRUE.
 */
BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
{
  if (ul_reason_for_call == DLL_PROCESS_ATTACH) {
    gotoblas_init();
  } else if (ul_reason_for_call == DLL_PROCESS_DETACH) {
    gotoblas_quit();
  }

  return TRUE;
}
/*
This is to allow static linking.
Code adapted from Google performance tools:
https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
Reference:
https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
*/
/* Termination hook: runs gotoblas_quit() and always returns 0. */
static int on_process_term(void) {
  gotoblas_quit();

  return 0;
}
/* Force the linker to include the `_tls_used` symbol (spelled with one
   more leading underscore when _WIN64 is not defined).
   NOTE(review): presumably this keeps the CRT's TLS support linked in so
   the callback registered below is honoured -- confirm against the
   references cited in the comment above. */
#ifdef _WIN64
#pragma comment(linker, "/INCLUDE:_tls_used")
#else
#pragma comment(linker, "/INCLUDE:__tls_used")
#endif
/* Place a pointer to DllMain in section ".CRT$XLB" (const_seg on _WIN64,
   data_seg otherwise), then restore the default section.
   NOTE(review): presumably the loader/CRT treats entries in this section
   as TLS callbacks, which is how DllMain gets invoked in a static build --
   confirm.
   NOTE(review): the pointer type returns void while DllMain is declared
   returning BOOL; the types do not match exactly -- confirm this is
   intended. */
#ifdef _WIN64
#pragma const_seg(".CRT$XLB")
#else
#pragma data_seg(".CRT$XLB")
#endif
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
#ifdef _WIN64
#pragma const_seg()
#else
#pragma data_seg()
#endif
/* Place a pointer to on_process_term in section ".CRT$XTU" (const_seg on
   _WIN64, data_seg otherwise), then restore the default section.
   NOTE(review): presumably a CRT termination table whose entries are run
   at process exit -- confirm. */
#ifdef _WIN64
#pragma const_seg(".CRT$XTU")
#else
#pragma data_seg(".CRT$XTU")
#endif
static int(*p_process_term)(void) = on_process_term;
#ifdef _WIN64
#pragma const_seg()
#else
#pragma data_seg()
#endif
#endif
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
/* Don't call me; this is just a workaround for a PGI / Sun bug.
   The function serves as a carrier for the inline asm below and as a
   reference to gotoblas_init() / gotoblas_quit().
   NOTE(review): presumably the asm stands in for constructor/destructor
   registration that these compilers do not perform -- confirm. */
void gotoblas_dummy_for_PGI(void) {
gotoblas_init();
gotoblas_quit();
#if 0
/* Disabled alternative: emit the addresses of gotoblas_init and
   gotoblas_quit into the .ctors and .dtors sections. */
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
#else
/* Active variant: emit `call gotoblas_init@PLT` into the .init section and
   `call gotoblas_quit@PLT` into the .fini section, switching back to .text
   after each. */
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
}
#endif