diff --git a/driver/others/memory.c b/driver/others/memory.c index d69e52e97..85f790615 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FIXED_PAGESIZE 4096 #endif +#ifndef BUFFERS_PER_THREAD +#ifdef USE_OPENMP +#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#else +#define BUFFERS_PER_THREAD NUM_BUFFERS +#endif +#endif + #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) @@ -213,7 +221,7 @@ int i,n; ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) return nums; ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; #endif @@ -415,8 +423,15 @@ struct release_t { int hugetlb_allocated = 0; -static struct release_t release_info[NUM_BUFFERS]; -static int release_pos = 0; +#if defined(OS_WINDOWS) +#define THREAD_LOCAL __declspec(thread) +#define UNLIKELY_TO_BE_ZERO(x) (x) +#else +#define THREAD_LOCAL __thread +#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0)) +#endif +static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD]; +static int THREAD_LOCAL release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -459,15 +474,9 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif } #ifdef OS_LINUX @@ -611,15 +620,9 @@ static void *alloc_mmap(void *address){ #endif if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif } return map_address; @@ -872,7 +875,7 @@ static void *alloc_hugetlb(void *address){ tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { CloseHandle(hToken); return (void*)-1; @@ -961,20 +964,17 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -static volatile struct { - BLASULONG lock; +struct memory_t { void *addr; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int pos; -#endif int used; #ifndef __64BIT__ char dummy[48]; #else char dummy[40]; #endif +}; -} memory[NUM_BUFFERS]; +static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD]; static int memory_initialized = 0; @@ -987,9 +987,6 @@ static int memory_initialized = 0; void *blas_memory_alloc(int procpos){ int position; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos; -#endif void *map_address; @@ -1020,102 +1017,48 @@ void *blas_memory_alloc(int procpos){ }; void *(**func)(void *address); -#if defined(USE_OPENMP) - if (!memory_initialized) { -#endif + if (UNLIKELY_TO_BE_ZERO(memory_initialized)) { - LOCK_COMMAND(&alloc_lock); + /* Only allow a single thread to initialize memory system */ + LOCK_COMMAND(&alloc_lock); - if (!memory_initialized) { - -#if defined(WHEREAMI) && !defined(USE_OPENMP) - for (position = 0; position < NUM_BUFFERS; position ++){ - memory[position].addr = (void *)0; - memory[position].pos = -1; - memory[position].used = 0; - memory[position].lock = 0; - } -#endif + if (!memory_initialized) { #ifdef DYNAMIC_ARCH - gotoblas_dynamic_init(); + gotoblas_dynamic_init(); #endif #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) - gotoblas_affinity_init(); + gotoblas_affinity_init(); #endif #ifdef SMP - if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); + if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) #ifndef DYNAMIC_ARCH - blas_set_parameter(); + blas_set_parameter(); #endif #endif - memory_initialized = 1; + memory_initialized = 1; + } + UNLOCK_COMMAND(&alloc_lock); } - UNLOCK_COMMAND(&alloc_lock); -#if defined(USE_OPENMP) - } -#endif #ifdef DEBUG printf("Alloc Start ...\n"); -#endif - -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - mypos = WhereAmI(); - - position = mypos; - while (position >= NUM_BUFFERS) position >>= 1; - - do { - if (!memory[position].used && (memory[position].pos == mypos)) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - blas_lock(&memory[position].lock); -#endif - if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif - } - - position ++; - - } while (position < NUM_BUFFERS); - - #endif position = 0; do { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - if (!memory[position].used) { - blas_lock(&memory[position].lock); -#endif if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); - } -#endif - position ++; - } while (position < NUM_BUFFERS); + } while (position < BUFFERS_PER_THREAD); goto error; @@ -1126,11 +1069,6 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif if (!memory[position].addr) { do { @@ -1148,14 +1086,14 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n"); } #endif #ifdef ALLOC_HUGETLBFILE if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { #ifndef OS_WINDOWS - fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n"); #endif } #endif @@ -1176,44 +1114,13 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif memory[position].addr = map_address; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); #endif } -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - if (memory[position].pos == -1) memory[position].pos = mypos; - -#endif - -#ifdef DYNAMIC_ARCH - - if (memory_initialized == 1) { - - LOCK_COMMAND(&alloc_lock); - - if (memory_initialized == 1) { - - if (!gotoblas) gotoblas_dynamic_init(); - - memory_initialized = 2; - } - - UNLOCK_COMMAND(&alloc_lock); - - } -#endif - - #ifdef DEBUG printf("Mapped : %p %3d\n\n", (void *)memory[position].addr, position); @@ -1222,7 +1129,7 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: - printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); return NULL; } @@ -1236,10 +1143,7 @@ void blas_memory_free(void *free_area){ #endif position = 0; -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif - while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) + while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area)) position++; if (memory[position].addr != free_area) goto error; @@ -1248,13 +1152,7 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif - // arm: ensure all writes are finished before other thread takes this memory - WMB; - memory[position].used = 0; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -1266,11 +1164,8 @@ void blas_memory_free(void *free_area){ printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); #ifdef DEBUG - for (position = 0; position < NUM_BUFFERS; position++) + for (position = 0; position < BUFFERS_PER_THREAD; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); -#endif -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); #endif return; } @@ -1293,8 +1188,6 @@ void blas_shutdown(void){ BLASFUNC(blas_thread_shutdown)(); #endif - LOCK_COMMAND(&alloc_lock); - for (pos = 0; pos < release_pos; pos ++) { release_info[pos].func(&release_info[pos]); } @@ -1305,17 +1198,11 @@ void blas_shutdown(void){ base_address = BASE_ADDRESS; #endif - for (pos = 0; pos < NUM_BUFFERS; pos ++){ + for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ memory[pos].addr = (void *)0; memory[pos].used = 0; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - memory[pos].pos = -1; -#endif - memory[pos].lock = 0; } - UNLOCK_COMMAND(&alloc_lock); - return; }