Merge pull request #1618 from oon3m0oo/less_locking

Remove the need for most locking in memory.c.
This commit is contained in:
Martin Kroeker 2018-06-15 00:10:29 +02:00 committed by GitHub
commit 12603b7dbb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 43 additions and 156 deletions

View File

@ -13,9 +13,9 @@ met:
notice, this list of conditions and the following disclaimer in notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the the documentation and/or other materials provided with the
distribution. distribution.
3. Neither the name of the OpenBLAS project nor the names of 3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products its contributors may be used to endorse or promote products
derived from this software without specific prior written derived from this software without specific prior written
permission. permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FIXED_PAGESIZE 4096 #define FIXED_PAGESIZE 4096
#endif #endif
/*
 * BUFFERS_PER_THREAD: element count of the THREAD_LOCAL release_info[] and
 * memory[] tables declared later in this file, and the upper bound of the
 * slot scans over memory[]. The #ifndef guard lets an earlier definition
 * take precedence over the defaults chosen here.
 */
#ifndef BUFFERS_PER_THREAD
#ifdef USE_OPENMP
/* OpenMP builds: sized from MAX_CPU_NUMBER and MAX_PARALLEL_NUMBER. */
#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#else
/* All other builds: same count as NUM_BUFFERS. */
#define BUFFERS_PER_THREAD NUM_BUFFERS
#endif
#endif
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#if defined(_MSC_VER) && !defined(__clang__) #if defined(_MSC_VER) && !defined(__clang__)
@ -213,7 +221,7 @@ int i,n;
ret = sched_getaffinity(0,size,cpusetp); ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums; if (ret!=0) return nums;
ret = CPU_COUNT_S(size,cpusetp); ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret; if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp); CPU_FREE(cpusetp);
return nums; return nums;
#endif #endif
@ -415,8 +423,15 @@ struct release_t {
int hugetlb_allocated = 0; int hugetlb_allocated = 0;
static struct release_t release_info[NUM_BUFFERS]; #if defined(OS_WINDOWS)
static int release_pos = 0; #define THREAD_LOCAL __declspec(thread)
#define UNLIKELY_TO_BE_ZERO(x) (x)
#else
#define THREAD_LOCAL __thread
#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0))
#endif
static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD];
static int THREAD_LOCAL release_pos = 0;
#if defined(OS_LINUX) && !defined(NO_WARMUP) #if defined(OS_LINUX) && !defined(NO_WARMUP)
static int hot_alloc = 0; static int hot_alloc = 0;
@ -459,15 +474,9 @@ static void *alloc_mmap(void *address){
} }
if (map_address != (void *)-1) { if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address; release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free; release_info[release_pos].func = alloc_mmap_free;
release_pos ++; release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
} }
#ifdef OS_LINUX #ifdef OS_LINUX
@ -611,15 +620,9 @@ static void *alloc_mmap(void *address){
#endif #endif
if (map_address != (void *)-1) { if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address; release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free; release_info[release_pos].func = alloc_mmap_free;
release_pos ++; release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
} }
return map_address; return map_address;
@ -872,7 +875,7 @@ static void *alloc_hugetlb(void *address){
tp.PrivilegeCount = 1; tp.PrivilegeCount = 1;
tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
CloseHandle(hToken); CloseHandle(hToken);
return (void*)-1; return (void*)-1;
@ -961,20 +964,17 @@ static BLASULONG base_address = 0UL;
static BLASULONG base_address = BASE_ADDRESS; static BLASULONG base_address = BASE_ADDRESS;
#endif #endif
static volatile struct { struct memory_t {
BLASULONG lock;
void *addr; void *addr;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int pos;
#endif
int used; int used;
#ifndef __64BIT__ #ifndef __64BIT__
char dummy[48]; char dummy[48];
#else #else
char dummy[40]; char dummy[40];
#endif #endif
};
} memory[NUM_BUFFERS]; static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD];
static int memory_initialized = 0; static int memory_initialized = 0;
@ -987,9 +987,6 @@ static int memory_initialized = 0;
void *blas_memory_alloc(int procpos){ void *blas_memory_alloc(int procpos){
int position; int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int mypos;
#endif
void *map_address; void *map_address;
@ -1020,102 +1017,48 @@ void *blas_memory_alloc(int procpos){
}; };
void *(**func)(void *address); void *(**func)(void *address);
#if defined(USE_OPENMP) if (UNLIKELY_TO_BE_ZERO(memory_initialized)) {
if (!memory_initialized) {
#endif
LOCK_COMMAND(&alloc_lock); /* Only allow a single thread to initialize memory system */
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) { if (!memory_initialized) {
#if defined(WHEREAMI) && !defined(USE_OPENMP)
for (position = 0; position < NUM_BUFFERS; position ++){
memory[position].addr = (void *)0;
memory[position].pos = -1;
memory[position].used = 0;
memory[position].lock = 0;
}
#endif
#ifdef DYNAMIC_ARCH #ifdef DYNAMIC_ARCH
gotoblas_dynamic_init(); gotoblas_dynamic_init();
#endif #endif
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
gotoblas_affinity_init(); gotoblas_affinity_init();
#endif #endif
#ifdef SMP #ifdef SMP
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif #endif
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
#ifndef DYNAMIC_ARCH #ifndef DYNAMIC_ARCH
blas_set_parameter(); blas_set_parameter();
#endif #endif
#endif #endif
memory_initialized = 1; memory_initialized = 1;
}
UNLOCK_COMMAND(&alloc_lock);
} }
UNLOCK_COMMAND(&alloc_lock);
#if defined(USE_OPENMP)
}
#endif
#ifdef DEBUG #ifdef DEBUG
printf("Alloc Start ...\n"); printf("Alloc Start ...\n");
#endif
#if defined(WHEREAMI) && !defined(USE_OPENMP)
mypos = WhereAmI();
position = mypos;
while (position >= NUM_BUFFERS) position >>= 1;
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#else
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
#endif
}
position ++;
} while (position < NUM_BUFFERS);
#endif #endif
position = 0; position = 0;
do { do {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#else
if (!memory[position].used) {
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation; if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
}
#endif
position ++; position ++;
} while (position < NUM_BUFFERS); } while (position < BUFFERS_PER_THREAD);
goto error; goto error;
@ -1126,11 +1069,6 @@ void *blas_memory_alloc(int procpos){
#endif #endif
memory[position].used = 1; memory[position].used = 1;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
#endif
if (!memory[position].addr) { if (!memory[position].addr) {
do { do {
@ -1148,14 +1086,14 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER #ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
} }
#endif #endif
#ifdef ALLOC_HUGETLBFILE #ifdef ALLOC_HUGETLBFILE
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS #ifndef OS_WINDOWS
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
#endif #endif
} }
#endif #endif
@ -1176,44 +1114,13 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1); } while ((BLASLONG)map_address == -1);
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address; memory[position].addr = map_address;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG #ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
#endif #endif
} }
#if defined(WHEREAMI) && !defined(USE_OPENMP)
if (memory[position].pos == -1) memory[position].pos = mypos;
#endif
#ifdef DYNAMIC_ARCH
if (memory_initialized == 1) {
LOCK_COMMAND(&alloc_lock);
if (memory_initialized == 1) {
if (!gotoblas) gotoblas_dynamic_init();
memory_initialized = 2;
}
UNLOCK_COMMAND(&alloc_lock);
}
#endif
#ifdef DEBUG #ifdef DEBUG
printf("Mapped : %p %3d\n\n", printf("Mapped : %p %3d\n\n",
(void *)memory[position].addr, position); (void *)memory[position].addr, position);
@ -1222,7 +1129,7 @@ void *blas_memory_alloc(int procpos){
return (void *)memory[position].addr; return (void *)memory[position].addr;
error: error:
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
return NULL; return NULL;
} }
@ -1236,10 +1143,7 @@ void blas_memory_free(void *free_area){
#endif #endif
position = 0; position = 0;
#if defined(SMP) && !defined(USE_OPENMP) while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area))
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++; position++;
if (memory[position].addr != free_area) goto error; if (memory[position].addr != free_area) goto error;
@ -1248,13 +1152,7 @@ void blas_memory_free(void *free_area){
printf(" Position : %d\n", position); printf(" Position : %d\n", position);
#endif #endif
// arm: ensure all writes are finished before other thread takes this memory
WMB;
memory[position].used = 0; memory[position].used = 0;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG #ifdef DEBUG
printf("Unmap Succeeded.\n\n"); printf("Unmap Succeeded.\n\n");
@ -1266,11 +1164,8 @@ void blas_memory_free(void *free_area){
printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
#ifdef DEBUG #ifdef DEBUG
for (position = 0; position < NUM_BUFFERS; position++) for (position = 0; position < BUFFERS_PER_THREAD; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif #endif
return; return;
} }
@ -1293,8 +1188,6 @@ void blas_shutdown(void){
BLASFUNC(blas_thread_shutdown)(); BLASFUNC(blas_thread_shutdown)();
#endif #endif
LOCK_COMMAND(&alloc_lock);
for (pos = 0; pos < release_pos; pos ++) { for (pos = 0; pos < release_pos; pos ++) {
release_info[pos].func(&release_info[pos]); release_info[pos].func(&release_info[pos]);
} }
@ -1305,17 +1198,11 @@ void blas_shutdown(void){
base_address = BASE_ADDRESS; base_address = BASE_ADDRESS;
#endif #endif
for (pos = 0; pos < NUM_BUFFERS; pos ++){ for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
memory[pos].addr = (void *)0; memory[pos].addr = (void *)0;
memory[pos].used = 0; memory[pos].used = 0;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
memory[pos].pos = -1;
#endif
memory[pos].lock = 0;
} }
UNLOCK_COMMAND(&alloc_lock);
return; return;
} }