Merge pull request #1618 from oon3m0oo/less_locking

Remove the need for most locking in memory.c.
This commit is contained in:
Martin Kroeker 2018-06-15 00:10:29 +02:00 committed by GitHub
commit 12603b7dbb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 43 additions and 156 deletions

View File

@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FIXED_PAGESIZE 4096 #define FIXED_PAGESIZE 4096
#endif #endif
/* Number of buffer slots in each thread's private bookkeeping tables; it is
 * the array length of the THREAD_LOCAL release_info[] and memory[] tables and
 * the bound of the loops that scan them. The #ifndef guard lets a definition
 * made earlier (e.g. by the build) take precedence over the defaults below. */
#ifndef BUFFERS_PER_THREAD
#ifdef USE_OPENMP
/* OpenMP build: MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER slots per thread. */
#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#else
/* Non-OpenMP build: each thread gets NUM_BUFFERS slots. */
#define BUFFERS_PER_THREAD NUM_BUFFERS
#endif
#endif
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#if defined(_MSC_VER) && !defined(__clang__) #if defined(_MSC_VER) && !defined(__clang__)
@ -415,8 +423,15 @@ struct release_t {
int hugetlb_allocated = 0; int hugetlb_allocated = 0;
static struct release_t release_info[NUM_BUFFERS]; #if defined(OS_WINDOWS)
static int release_pos = 0; #define THREAD_LOCAL __declspec(thread)
#define UNLIKELY_TO_BE_ZERO(x) (x)
#else
#define THREAD_LOCAL __thread
#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0))
#endif
static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD];
static int THREAD_LOCAL release_pos = 0;
#if defined(OS_LINUX) && !defined(NO_WARMUP) #if defined(OS_LINUX) && !defined(NO_WARMUP)
static int hot_alloc = 0; static int hot_alloc = 0;
@ -459,15 +474,9 @@ static void *alloc_mmap(void *address){
} }
if (map_address != (void *)-1) { if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address; release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free; release_info[release_pos].func = alloc_mmap_free;
release_pos ++; release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
} }
#ifdef OS_LINUX #ifdef OS_LINUX
@ -611,15 +620,9 @@ static void *alloc_mmap(void *address){
#endif #endif
if (map_address != (void *)-1) { if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address; release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free; release_info[release_pos].func = alloc_mmap_free;
release_pos ++; release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
} }
return map_address; return map_address;
@ -961,20 +964,17 @@ static BLASULONG base_address = 0UL;
static BLASULONG base_address = BASE_ADDRESS; static BLASULONG base_address = BASE_ADDRESS;
#endif #endif
static volatile struct { struct memory_t {
BLASULONG lock;
void *addr; void *addr;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int pos;
#endif
int used; int used;
#ifndef __64BIT__ #ifndef __64BIT__
char dummy[48]; char dummy[48];
#else #else
char dummy[40]; char dummy[40];
#endif #endif
};
} memory[NUM_BUFFERS]; static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD];
static int memory_initialized = 0; static int memory_initialized = 0;
@ -987,9 +987,6 @@ static int memory_initialized = 0;
void *blas_memory_alloc(int procpos){ void *blas_memory_alloc(int procpos){
int position; int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int mypos;
#endif
void *map_address; void *map_address;
@ -1020,102 +1017,48 @@ void *blas_memory_alloc(int procpos){
}; };
void *(**func)(void *address); void *(**func)(void *address);
#if defined(USE_OPENMP) if (UNLIKELY_TO_BE_ZERO(memory_initialized)) {
if (!memory_initialized) {
#endif
LOCK_COMMAND(&alloc_lock); /* Only allow a single thread to initialize memory system */
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) { if (!memory_initialized) {
#if defined(WHEREAMI) && !defined(USE_OPENMP)
for (position = 0; position < NUM_BUFFERS; position ++){
memory[position].addr = (void *)0;
memory[position].pos = -1;
memory[position].used = 0;
memory[position].lock = 0;
}
#endif
#ifdef DYNAMIC_ARCH #ifdef DYNAMIC_ARCH
gotoblas_dynamic_init(); gotoblas_dynamic_init();
#endif #endif
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
gotoblas_affinity_init(); gotoblas_affinity_init();
#endif #endif
#ifdef SMP #ifdef SMP
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif #endif
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
#ifndef DYNAMIC_ARCH #ifndef DYNAMIC_ARCH
blas_set_parameter(); blas_set_parameter();
#endif #endif
#endif #endif
memory_initialized = 1; memory_initialized = 1;
}
UNLOCK_COMMAND(&alloc_lock);
} }
UNLOCK_COMMAND(&alloc_lock);
#if defined(USE_OPENMP)
}
#endif
#ifdef DEBUG #ifdef DEBUG
printf("Alloc Start ...\n"); printf("Alloc Start ...\n");
#endif
#if defined(WHEREAMI) && !defined(USE_OPENMP)
mypos = WhereAmI();
position = mypos;
while (position >= NUM_BUFFERS) position >>= 1;
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#else
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
#endif
}
position ++;
} while (position < NUM_BUFFERS);
#endif #endif
position = 0; position = 0;
do { do {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#else
if (!memory[position].used) {
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation; if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
}
#endif
position ++; position ++;
} while (position < NUM_BUFFERS); } while (position < BUFFERS_PER_THREAD);
goto error; goto error;
@ -1126,11 +1069,6 @@ void *blas_memory_alloc(int procpos){
#endif #endif
memory[position].used = 1; memory[position].used = 1;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
#endif
if (!memory[position].addr) { if (!memory[position].addr) {
do { do {
@ -1148,14 +1086,14 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER #ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
} }
#endif #endif
#ifdef ALLOC_HUGETLBFILE #ifdef ALLOC_HUGETLBFILE
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS #ifndef OS_WINDOWS
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
#endif #endif
} }
#endif #endif
@ -1176,44 +1114,13 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1); } while ((BLASLONG)map_address == -1);
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address; memory[position].addr = map_address;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG #ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
#endif #endif
} }
#if defined(WHEREAMI) && !defined(USE_OPENMP)
if (memory[position].pos == -1) memory[position].pos = mypos;
#endif
#ifdef DYNAMIC_ARCH
if (memory_initialized == 1) {
LOCK_COMMAND(&alloc_lock);
if (memory_initialized == 1) {
if (!gotoblas) gotoblas_dynamic_init();
memory_initialized = 2;
}
UNLOCK_COMMAND(&alloc_lock);
}
#endif
#ifdef DEBUG #ifdef DEBUG
printf("Mapped : %p %3d\n\n", printf("Mapped : %p %3d\n\n",
(void *)memory[position].addr, position); (void *)memory[position].addr, position);
@ -1222,7 +1129,7 @@ void *blas_memory_alloc(int procpos){
return (void *)memory[position].addr; return (void *)memory[position].addr;
error: error:
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
return NULL; return NULL;
} }
@ -1236,10 +1143,7 @@ void blas_memory_free(void *free_area){
#endif #endif
position = 0; position = 0;
#if defined(SMP) && !defined(USE_OPENMP) while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area))
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++; position++;
if (memory[position].addr != free_area) goto error; if (memory[position].addr != free_area) goto error;
@ -1248,13 +1152,7 @@ void blas_memory_free(void *free_area){
printf(" Position : %d\n", position); printf(" Position : %d\n", position);
#endif #endif
// arm: ensure all writes are finished before other thread takes this memory
WMB;
memory[position].used = 0; memory[position].used = 0;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG #ifdef DEBUG
printf("Unmap Succeeded.\n\n"); printf("Unmap Succeeded.\n\n");
@ -1266,11 +1164,8 @@ void blas_memory_free(void *free_area){
printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
#ifdef DEBUG #ifdef DEBUG
for (position = 0; position < NUM_BUFFERS; position++) for (position = 0; position < BUFFERS_PER_THREAD; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif #endif
return; return;
} }
@ -1293,8 +1188,6 @@ void blas_shutdown(void){
BLASFUNC(blas_thread_shutdown)(); BLASFUNC(blas_thread_shutdown)();
#endif #endif
LOCK_COMMAND(&alloc_lock);
for (pos = 0; pos < release_pos; pos ++) { for (pos = 0; pos < release_pos; pos ++) {
release_info[pos].func(&release_info[pos]); release_info[pos].func(&release_info[pos]);
} }
@ -1305,17 +1198,11 @@ void blas_shutdown(void){
base_address = BASE_ADDRESS; base_address = BASE_ADDRESS;
#endif #endif
for (pos = 0; pos < NUM_BUFFERS; pos ++){ for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
memory[pos].addr = (void *)0; memory[pos].addr = (void *)0;
memory[pos].used = 0; memory[pos].used = 0;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
memory[pos].pos = -1;
#endif
memory[pos].lock = 0;
} }
UNLOCK_COMMAND(&alloc_lock);
return; return;
} }