Further improvements to memory.c. (#1625)

- Compiler TLS is now used only when the compiler supports it
- If compiler TLS is unsupported, we use platform-specific TLS (see the sketch just before the diff)
- Only one variable (an index) is now in TLS
- We only access TLS once per alloc, and never when freeing
- Allocation / release info is now stored within the allocation itself, by
  over-allocating; this saves having external structures do the bookkeeping, and
  reduces some of the redundant data that was being stored (such as addresses);
  see the sketch right after this list
- We never hit the alloc lock when not using SMP or when using OpenMP (that was
  my fault)
- Now that there are fewer tracking structures I think this is a bit easier to
  read than before
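
For context, a minimal standalone sketch of the over-allocation scheme follows: a small header is written at the start of each block, the caller receives a pointer just past it, and freeing recovers the header by pointer arithmetic instead of consulting an external table. The names used here (buf_header, demo_alloc, demo_free, release_with_free) are illustrative stand-ins, not code from this commit; in the diff the header is struct alloc_t and the entry points are blas_memory_alloc / blas_memory_free.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical header stored in front of every buffer; the commit's struct
   alloc_t plays this role. The pad keeps the user-visible buffer 64-byte
   aligned relative to the start of the block. */
struct buf_header {
  int used;                                   /* is the buffer handed out? */
  int attr;                                   /* extra info needed at release time */
  void (*release_func)(struct buf_header *);  /* how to give the memory back */
  char pad[64 - 2 * sizeof(int) - sizeof(void (*)(struct buf_header *))];
};

static void release_with_free(struct buf_header *hdr) {
  free(hdr);                   /* the header is the true start of the allocation */
}

/* Over-allocate: header + payload in one block, hand back a pointer past the header. */
static void *demo_alloc(size_t payload_size) {
  struct buf_header *hdr = malloc(sizeof(*hdr) + payload_size);
  if (!hdr) return NULL;
  hdr->used = 1;
  hdr->attr = 0;
  hdr->release_func = release_with_free;
  return (char *)hdr + sizeof(*hdr);
}

/* Recover the header from the payload pointer; no table search, no TLS access.
   As in the commit, "freeing" only marks the buffer as reusable. */
static void demo_free(void *payload) {
  struct buf_header *hdr = (struct buf_header *)((char *)payload - sizeof(*hdr));
  hdr->used = 0;
}

int main(void) {
  double *work = demo_alloc(16 * sizeof(double));
  if (!work) return 1;
  memset(work, 0, 16 * sizeof(double));
  demo_free(work);                             /* buffer is available again */

  /* At shutdown, the header itself knows how to release the memory. */
  struct buf_header *hdr = (struct buf_header *)((char *)work - sizeof(*hdr));
  hdr->release_func(hdr);
  printf("ok\n");
  return 0;
}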
oon3m0oo 2018-06-20 21:04:03 +01:00 committed by Martin Kroeker
parent f66b9c8826
commit a399d00425
1 changed file with 243 additions and 156 deletions


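Before the diff itself, a brief standalone sketch of the thread-local index fallback described in the commit message, assuming a POSIX threads environment: compiler TLS (__thread) is used when available, otherwise a pthread key holds the per-thread index, and in both cases the index is assigned lazily under a lock and read once per call. The names DEMO_HAS_COMPILER_TLS, get_thread_slot, my_slot and slot_key are hypothetical; the patch's equivalents are HAS_COMPILER_TLS, get_memory_table, local_memory_table_pos and local_storage_key.

#include <pthread.h>
#include <stdio.h>

/* Illustrative compiler-TLS detection; the patch's real macro is HAS_COMPILER_TLS
   and is derived from compiler version checks. */
#if defined(__GNUC__) || defined(__clang__)
#  define DEMO_HAS_COMPILER_TLS
#endif

static int next_slot = 1;                 /* slot 0 means "not assigned yet" */
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

#if defined(DEMO_HAS_COMPILER_TLS)
static __thread int my_slot = 0;          /* only one int lives in TLS */
#else
static pthread_key_t slot_key;            /* system-dependent TLS fallback */
static pthread_once_t slot_once = PTHREAD_ONCE_INIT;
static void make_key(void) { pthread_key_create(&slot_key, NULL); }
#endif

/* Return this thread's slot, assigning one on first use. */
static int get_thread_slot(void) {
#if defined(DEMO_HAS_COMPILER_TLS)
  if (my_slot == 0) {
    pthread_mutex_lock(&slot_lock);
    my_slot = next_slot++;
    pthread_mutex_unlock(&slot_lock);
  }
  return my_slot;
#else
  int slot;
  pthread_once(&slot_once, make_key);
  slot = (int)(long)pthread_getspecific(slot_key);
  if (slot == 0) {
    pthread_mutex_lock(&slot_lock);
    slot = next_slot++;
    pthread_mutex_unlock(&slot_lock);
    pthread_setspecific(slot_key, (void *)(long)slot);
  }
  return slot;
#endif
}

int main(void) {
  printf("this thread uses slot %d\n", get_thread_slot());
  return 0;
}
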
@ -326,6 +326,8 @@ int goto_get_num_procs (void) {
return blas_cpu_number;
}
static void blas_memory_init();
void openblas_fork_handler()
{
// This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
@ -337,7 +339,7 @@ void openblas_fork_handler()
// implementation of OpenMP.
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
int err;
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);
if(err != 0)
openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
#endif
@ -415,23 +417,104 @@ int openblas_get_num_threads(void) {
#endif
}
struct release_t {
void *address;
void (*func)(struct release_t *);
long attr;
};
int hugetlb_allocated = 0;
#if defined(OS_WINDOWS)
#define THREAD_LOCAL __declspec(thread)
#define UNLIKELY_TO_BE_ZERO(x) (x)
#define LIKELY_ONE(x) (x)
#else
#define THREAD_LOCAL __thread
#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0))
#define LIKELY_ONE(x) (__builtin_expect(x, 1))
#endif
static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD];
static int THREAD_LOCAL release_pos = 0;
/* Stores information about the allocation and how to release it */
struct alloc_t {
/* Whether this allocation is being used */
int used;
/* Any special attributes needed when releasing this allocation */
int attr;
/* Function that can properly release this memory */
void (*release_func)(struct alloc_t *);
/* Pad to 64-byte alignment */
char pad[64 - 2 * sizeof(int) - sizeof(void(*))];
};
/* Convenience macros for storing release funcs */
#define STORE_RELEASE_FUNC(address, func) \
if (address != (void *)-1) { \
struct alloc_t *alloc_info = (struct alloc_t *)address; \
alloc_info->release_func = func; \
}
#define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \
if (address != (void *)-1) { \
struct alloc_t *alloc_info = (struct alloc_t *)address; \
alloc_info->release_func = func; \
alloc_info->attr = attr; \
}
/* The number of bytes that will be allocated for each buffer. When allocating
memory, we store an alloc_t followed by the actual buffer memory. This means
that each allocation always has its associated alloc_t, without the need
for an auxiliary tracking structure. */
static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
/* Clang supports TLS from version 2.8 */
#if defined(__clang__) && (__clang_major__ > 2 || \
    (__clang_major__ == 2 && __clang_minor__ >= 8))
#define HAS_COMPILER_TLS
#endif
/* GCC supports TLS from version 4.1 */
#if !defined(__clang__) && defined(__GNUC__) && \
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
#define HAS_COMPILER_TLS
#endif
/* MSVC supports TLS from version 2005 */
#if defined(_MSC_VER) && _MSC_VER >= 1400
#define HAS_COMPILER_TLS
#endif
/* Versions of Xcode before 8 did not properly support TLS */
#if defined(__apple_build_version__) && __apple_build_version__ < 8000042
#undef HAS_COMPILER_TLS
#endif
/* Android NDKs before version 12b did not support TLS */
#if defined(__ANDROID__) && defined(__clang__)
#if __has_include(<android/ndk-version.h>)
#include <android/ndk-version.h>
#endif
#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
defined(__NDK_MINOR__) && \
((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
#undef HAS_COMPILER_TLS
#endif
#endif
/* Holds pointers to allocated memory */
#if defined(SMP) && !defined(USE_OPENMP)
/* This is the number of threads that can be spawned by the server, which is the
server plus the number of threads in the thread pool */
# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER
static int next_memory_table_pos = 0;
# if defined(HAS_COMPILER_TLS)
/* Use compiler generated thread-local-storage */
static int THREAD_LOCAL local_memory_table_pos = 0;
# else
/* Use system-dependent thread-local-storage */
# if defined(OS_WINDOWS)
static DWORD local_storage_key;
# else
static pthread_key_t local_storage_key;
# endif /* defined(OS_WINDOWS) */
# endif /* defined(HAS_COMPILER_TLS) */
#else
/* There is only one allocating thread in single-threaded mode or when using OpenMP */
# define MAX_ALLOCATING_THREADS 1
#endif /* defined(SMP) && !defined(USE_OPENMP) */
static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD];
#if defined(OS_LINUX) && !defined(NO_WARMUP)
static int hot_alloc = 0;
@ -447,11 +530,41 @@ static pthread_spinlock_t alloc_lock = 0;
static BLASULONG alloc_lock = 0UL;
#endif
/* Returns a pointer to the start of the per-thread memory allocation data */
static __inline struct alloc_t ** get_memory_table() {
#if defined(SMP) && !defined(USE_OPENMP)
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
int local_memory_table_pos = (int)TlsGetValue(local_storage_key);
# else
int local_memory_table_pos = (int)pthread_getspecific(local_storage_key);
# endif /* defined(OS_WINDOWS) */
# endif /* !defined(HAS_COMPILER_TLS) */
if (!local_memory_table_pos) {
LOCK_COMMAND(&alloc_lock);
local_memory_table_pos = next_memory_table_pos++;
UNLOCK_COMMAND(&alloc_lock);
if (next_memory_table_pos > MAX_ALLOCATING_THREADS)
printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n");
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
TlsSetValue(local_storage_key, (void*)local_memory_table_pos);
# else
pthread_setspecific(local_storage_key, (void*)local_memory_table_pos);
# endif /* defined(OS_WINDOWS) */
# endif /* !defined(HAS_COMPILER_TLS) */
}
return local_memory_table[local_memory_table_pos];
#else
return local_memory_table[0];
#endif /* defined(SMP) && !defined(USE_OPENMP) */
}
#ifdef ALLOC_MMAP
static void alloc_mmap_free(struct release_t *release){
static void alloc_mmap_free(struct alloc_t *alloc_info){
if (munmap(release -> address, BUFFER_SIZE)) {
if (munmap(alloc_info, allocation_block_size)) {
printf("OpenBLAS : munmap failed\n");
}
}
@ -465,22 +578,18 @@ static void *alloc_mmap(void *address){
if (address){
map_address = mmap(address,
BUFFER_SIZE,
allocation_block_size,
MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
} else {
map_address = mmap(address,
BUFFER_SIZE,
allocation_block_size,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
}
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
}
STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
return map_address;
@ -533,25 +642,25 @@ static void *alloc_mmap(void *address){
if (address){
/* Just give up use advanced operation */
map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
} else {
#if defined(OS_LINUX) && !defined(NO_WARMUP)
if (hot_alloc == 0) {
map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
} else {
#endif
map_address = mmap(NULL, BUFFER_SIZE * SCALING,
map_address = mmap(NULL, allocation_block_size * SCALING,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
if (map_address != (void *)-1) {
@ -559,7 +668,7 @@ static void *alloc_mmap(void *address){
#ifdef OS_LINUX
#ifdef DEBUG
int ret=0;
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
if(ret==-1){
int errsv=errno;
perror("OpenBLAS alloc_mmap:");
@ -567,7 +676,7 @@ static void *alloc_mmap(void *address){
}
#else
my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
#endif
#endif
@ -575,7 +684,7 @@ static void *alloc_mmap(void *address){
allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
start = (BLASULONG)map_address;
current = (SCALING - 1) * BUFFER_SIZE;
current = (SCALING - 1) * allocation_block_size;
while(current > 0) {
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
@ -590,7 +699,7 @@ static void *alloc_mmap(void *address){
best = (BLASULONG)-1;
best_address = map_address;
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
current = run_bench(start, allocsize);
@ -606,7 +715,7 @@ static void *alloc_mmap(void *address){
if ((BLASULONG)best_address > (BLASULONG)map_address)
munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);
map_address = best_address;
@ -619,11 +728,7 @@ static void *alloc_mmap(void *address){
}
#endif
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
}
STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
return map_address;
}
@ -635,9 +740,9 @@ static void *alloc_mmap(void *address){
#ifdef ALLOC_MALLOC
static void alloc_malloc_free(struct release_t *release){
static void alloc_malloc_free(struct alloc_t *alloc_info){
free(release -> address);
free(alloc_info);
}
@ -645,15 +750,11 @@ static void *alloc_malloc(void *address){
void *map_address;
map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
if (map_address == (void *)NULL) map_address = (void *)-1;
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_malloc_free;
release_pos ++;
}
STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
return map_address;
@ -670,24 +771,20 @@ void *qfree (void *address);
#define QCOMMS 0x2
#define QFAST 0x4
static void alloc_qalloc_free(struct release_t *release){
static void alloc_qalloc_free(struct alloc_t *alloc_info){
qfree(release -> address);
qfree(alloc_info);
}
static void *alloc_qalloc(void *address){
void *map_address;
map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
if (map_address == (void *)NULL) map_address = (void *)-1;
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_qalloc_free;
release_pos ++;
}
STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
}
@ -696,9 +793,9 @@ static void *alloc_qalloc(void *address){
#ifdef ALLOC_WINDOWS
static void alloc_windows_free(struct release_t *release){
static void alloc_windows_free(struct alloc_t *alloc_info){
VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
}
@ -706,17 +803,13 @@ static void *alloc_windows(void *address){
void *map_address;
map_address = VirtualAlloc(address,
BUFFER_SIZE,
allocation_block_size,
MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
if (map_address == (void *)NULL) map_address = (void *)-1;
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_windows_free;
release_pos ++;
}
STORE_RELEASE_FUNC(map_address, alloc_windows_free);
return map_address;
}
@ -728,13 +821,14 @@ static void *alloc_windows(void *address){
#define DEVICEDRIVER_NAME "/dev/mapper"
#endif
static void alloc_devicedirver_free(struct release_t *release){
static void alloc_devicedirver_free(struct alloc_t *alloc_info){
if (munmap(release -> address, BUFFER_SIZE)) {
int attr = alloc_info -> attr;
if (munmap(alloc_info, allocation_block_size)) {
printf("OpenBLAS : Bugphysarea unmap failed.\n");
}
if (close(release -> attr)) {
if (close(attr)) {
printf("OpenBLAS : Bugphysarea close failed.\n");
}
@ -751,17 +845,12 @@ static void *alloc_devicedirver(void *address){
}
map_address = mmap(address, BUFFER_SIZE,
map_address = mmap(address, allocation_block_size,
PROT_READ | PROT_WRITE,
MAP_FILE | MAP_SHARED,
fd, 0);
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].attr = fd;
release_info[release_pos].func = alloc_devicedirver_free;
release_pos ++;
}
STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);
return map_address;
}
@ -770,9 +859,9 @@ static void *alloc_devicedirver(void *address){
#ifdef ALLOC_SHM
static void alloc_shm_free(struct release_t *release){
static void alloc_shm_free(struct alloc_t *alloc_info){
if (shmdt(release -> address)) {
if (shmdt(alloc_info)) {
printf("OpenBLAS : Shared memory unmap failed.\n");
}
}
@ -781,22 +870,21 @@ static void *alloc_shm(void *address){
void *map_address;
int shmid;
shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);
map_address = (void *)shmat(shmid, address, 0);
if (map_address != (void *)-1){
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
shmctl(shmid, IPC_RMID, 0);
release_info[release_pos].address = map_address;
release_info[release_pos].attr = shmid;
release_info[release_pos].func = alloc_shm_free;
release_pos ++;
struct alloc_t *alloc_info = (struct alloc_t *)map_address;
alloc_info->release_func = alloc_shm_free;
alloc_info->attr = shmid;
}
return map_address;
@ -804,23 +892,23 @@ static void *alloc_shm(void *address){
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
static void alloc_hugetlb_free(struct release_t *release){
static void alloc_hugetlb_free(struct alloc_t *alloc_info){
#if defined(OS_LINUX) || defined(OS_AIX)
if (shmdt(release -> address)) {
if (shmdt(alloc_info)) {
printf("OpenBLAS : Hugepage unmap failed.\n");
}
#endif
#ifdef __sun__
munmap(release -> address, BUFFER_SIZE);
munmap(alloc_info, allocation_block_size);
#endif
#ifdef OS_WINDOWS
VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
#endif
@ -833,7 +921,7 @@ static void *alloc_hugetlb(void *address){
#if defined(OS_LINUX) || defined(OS_AIX)
int shmid;
shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
shmid = shmget(IPC_PRIVATE, allocation_block_size,
#ifdef OS_LINUX
SHM_HUGETLB |
#endif
@ -846,7 +934,7 @@ static void *alloc_hugetlb(void *address){
map_address = (void *)shmat(shmid, address, SHM_RND);
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
if (map_address != (void *)-1){
@ -863,7 +951,7 @@ static void *alloc_hugetlb(void *address){
mha.mha_pagesize = HUGE_PAGESIZE;
memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
#endif
#ifdef OS_WINDOWS
@ -887,7 +975,7 @@ static void *alloc_hugetlb(void *address){
}
map_address = (void *)VirtualAlloc(address,
BUFFER_SIZE,
allocation_block_size,
MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
@ -898,11 +986,7 @@ static void *alloc_hugetlb(void *address){
#endif
if (map_address != (void *)-1){
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_hugetlb_free;
release_pos ++;
}
STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
return map_address;
}
@ -914,13 +998,14 @@ static void *alloc_hugetlb(void *address){
static int hugetlb_pid = 0;
static void alloc_hugetlbfile_free(struct release_t *release){
static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){
if (munmap(release -> address, BUFFER_SIZE)) {
int attr = alloc_info -> attr;
if (munmap(alloc_info, allocation_block_size)) {
printf("OpenBLAS : HugeTLBfs unmap failed.\n");
}
if (close(release -> attr)) {
if (close(attr)) {
printf("OpenBLAS : HugeTLBfs close failed.\n");
}
}
@ -941,17 +1026,12 @@ static void *alloc_hugetlbfile(void *address){
unlink(filename);
map_address = mmap(address, BUFFER_SIZE,
map_address = mmap(address, allocation_block_size,
PROT_READ | PROT_WRITE,
MAP_SHARED,
fd, 0);
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].attr = fd;
release_info[release_pos].func = alloc_hugetlbfile_free;
release_pos ++;
}
STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
return map_address;
}
@ -964,19 +1044,11 @@ static BLASULONG base_address = 0UL;
static BLASULONG base_address = BASE_ADDRESS;
#endif
struct memory_t {
void *addr;
int used;
#ifndef __64BIT__
char dummy[48];
#if __STDC_VERSION__ >= 201112L
static _Atomic int memory_initialized = 0;
#else
char dummy[40];
static volatile int memory_initialized = 0;
#endif
};
static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD];
static int memory_initialized = 0;
/* Memory allocation routine */
/* procpos ... indicates where it comes from */
@ -984,6 +1056,20 @@ static int memory_initialized = 0;
/* 1 : Level 2 functions */
/* 2 : Thread */
static void blas_memory_init(){
#if defined(SMP) && !defined(USE_OPENMP)
next_memory_table_pos = 0;
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
local_storage_key = TlsAlloc();
# else
pthread_key_create(&local_storage_key, NULL);
# endif /* defined(OS_WINDOWS) */
# endif /* !defined(HAS_COMPILER_TLS) */
#endif /* defined(SMP) && !defined(USE_OPENMP) */
memset(local_memory_table, 0, sizeof(local_memory_table));
}
void *blas_memory_alloc(int procpos){
int position;
@ -1016,14 +1102,17 @@ void *blas_memory_alloc(int procpos){
NULL,
};
void *(**func)(void *address);
struct alloc_t * alloc_info;
struct alloc_t ** alloc_table;
if (UNLIKELY_TO_BE_ZERO(memory_initialized)) {
if (!LIKELY_ONE(memory_initialized)) {
#if defined(SMP) && !defined(USE_OPENMP)
/* Only allow a single thread to initialize memory system */
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
#endif
blas_memory_init();
#ifdef DYNAMIC_ARCH
gotoblas_dynamic_init();
#endif
@ -1044,8 +1133,10 @@ void *blas_memory_alloc(int procpos){
memory_initialized = 1;
#if defined(SMP) && !defined(USE_OPENMP)
}
UNLOCK_COMMAND(&alloc_lock);
#endif
}
#ifdef DEBUG
@ -1053,9 +1144,9 @@ void *blas_memory_alloc(int procpos){
#endif
position = 0;
alloc_table = get_memory_table();
do {
if (!memory[position].used) goto allocation;
if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
position ++;
} while (position < BUFFERS_PER_THREAD);
@ -1068,9 +1159,8 @@ void *blas_memory_alloc(int procpos){
printf(" Position -> %d\n", position);
#endif
memory[position].used = 1;
if (!memory[position].addr) {
alloc_info = alloc_table[position];
if (!alloc_info) {
do {
#ifdef DEBUG
printf("Allocation Start : %lx\n", base_address);
@ -1110,23 +1200,24 @@ void *blas_memory_alloc(int procpos){
#endif
if (((BLASLONG) map_address) == -1) base_address = 0UL;
if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE;
} while ((BLASLONG)map_address == -1);
memory[position].addr = map_address;
alloc_table[position] = alloc_info = map_address;
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
#endif
}
#ifdef DEBUG
printf("Mapped : %p %3d\n\n",
(void *)memory[position].addr, position);
printf("Mapped : %p %3d\n\n", (void *)alloc_info, position);
#endif
return (void *)memory[position].addr;
alloc_info->used = 1;
return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
error:
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
@ -1134,25 +1225,19 @@ void *blas_memory_alloc(int procpos){
return NULL;
}
void blas_memory_free(void *free_area){
void blas_memory_free(void *buffer){
#ifdef DEBUG
int position;
struct alloc_t ** alloc_table;
#endif
/* Since we passed an offset pointer to the caller, get back to the actual allocation */
struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
#ifdef DEBUG
printf("Unmapped Start : %p ...\n", free_area);
printf("Unmapped Start : %p ...\n", alloc_info);
#endif
position = 0;
while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area))
position++;
if (memory[position].addr != free_area) goto error;
#ifdef DEBUG
printf(" Position : %d\n", position);
#endif
memory[position].used = 0;
alloc_info->used = 0;
#ifdef DEBUG
printf("Unmap Succeeded.\n\n");
@ -1160,12 +1245,13 @@ void blas_memory_free(void *free_area){
return;
error:
printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
#ifdef DEBUG
for (position = 0; position < BUFFERS_PER_THREAD; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
alloc_table = get_memory_table();
for (position = 0; position < BUFFERS_PER_THREAD; position++){
if (alloc_table[position]) {
printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
}
}
#endif
return;
}
@ -1182,14 +1268,20 @@ void blas_memory_free_nolock(void * map_address) {
void blas_shutdown(void){
int pos;
int pos, thread;
#ifdef SMP
BLASFUNC(blas_thread_shutdown)();
#endif
for (pos = 0; pos < release_pos; pos ++) {
release_info[pos].func(&release_info[pos]);
for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){
for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
struct alloc_t *alloc_info = local_memory_table[thread][pos];
if (alloc_info) {
alloc_info->release_func(alloc_info);
local_memory_table[thread][pos] = (void *)0;
}
}
}
#ifdef SEEK_ADDRESS
@ -1198,11 +1290,6 @@ void blas_shutdown(void){
base_address = BASE_ADDRESS;
#endif
for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
memory[pos].addr = (void *)0;
memory[pos].used = 0;
}
return;
}
@ -1226,7 +1313,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
size_t size;
BLASULONG buffer;
size = BUFFER_SIZE - PAGESIZE;
size = allocation_block_size - PAGESIZE;
buffer = (BLASULONG)sa + GEMM_OFFSET_A;
#if defined(OS_LINUX) && !defined(NO_WARMUP)
@ -1247,7 +1334,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
UNLOCK_COMMAND(&init_lock);
#endif
size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
size = MIN((allocation_block_size - PAGESIZE), L2_SIZE);
buffer = (BLASULONG)sa + GEMM_OFFSET_A;
while (size > 0) {