Merge pull request #1618 from oon3m0oo/less_locking

Remove the need for most locking in memory.c.
This commit is contained in:
Martin Kroeker 2018-06-15 00:10:29 +02:00 committed by GitHub
commit 12603b7dbb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 43 additions and 156 deletions

View File

@ -13,9 +13,9 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FIXED_PAGESIZE 4096
#endif
#ifndef BUFFERS_PER_THREAD
#ifdef USE_OPENMP
#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#else
#define BUFFERS_PER_THREAD NUM_BUFFERS
#endif
#endif
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#if defined(_MSC_VER) && !defined(__clang__)
@ -213,7 +221,7 @@ int i,n;
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
#endif
@ -415,8 +423,15 @@ struct release_t {
int hugetlb_allocated = 0;
static struct release_t release_info[NUM_BUFFERS];
static int release_pos = 0;
#if defined(OS_WINDOWS)
#define THREAD_LOCAL __declspec(thread)
#define UNLIKELY_TO_BE_ZERO(x) (x)
#else
#define THREAD_LOCAL __thread
#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0))
#endif
static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD];
static int THREAD_LOCAL release_pos = 0;
#if defined(OS_LINUX) && !defined(NO_WARMUP)
static int hot_alloc = 0;
@ -459,15 +474,9 @@ static void *alloc_mmap(void *address){
}
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
#ifdef OS_LINUX
@ -611,15 +620,9 @@ static void *alloc_mmap(void *address){
#endif
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
return map_address;
@ -872,7 +875,7 @@ static void *alloc_hugetlb(void *address){
tp.PrivilegeCount = 1;
tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
CloseHandle(hToken);
return (void*)-1;
@ -961,20 +964,17 @@ static BLASULONG base_address = 0UL;
static BLASULONG base_address = BASE_ADDRESS;
#endif
static volatile struct {
BLASULONG lock;
struct memory_t {
void *addr;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int pos;
#endif
int used;
#ifndef __64BIT__
char dummy[48];
#else
char dummy[40];
#endif
};
} memory[NUM_BUFFERS];
static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD];
static int memory_initialized = 0;
@ -987,9 +987,6 @@ static int memory_initialized = 0;
void *blas_memory_alloc(int procpos){
int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int mypos;
#endif
void *map_address;
@ -1020,102 +1017,48 @@ void *blas_memory_alloc(int procpos){
};
void *(**func)(void *address);
#if defined(USE_OPENMP)
if (!memory_initialized) {
#endif
if (UNLIKELY_TO_BE_ZERO(memory_initialized)) {
LOCK_COMMAND(&alloc_lock);
/* Only allow a single thread to initialize memory system */
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
#if defined(WHEREAMI) && !defined(USE_OPENMP)
for (position = 0; position < NUM_BUFFERS; position ++){
memory[position].addr = (void *)0;
memory[position].pos = -1;
memory[position].used = 0;
memory[position].lock = 0;
}
#endif
if (!memory_initialized) {
#ifdef DYNAMIC_ARCH
gotoblas_dynamic_init();
gotoblas_dynamic_init();
#endif
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
gotoblas_affinity_init();
gotoblas_affinity_init();
#endif
#ifdef SMP
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
#ifndef DYNAMIC_ARCH
blas_set_parameter();
blas_set_parameter();
#endif
#endif
memory_initialized = 1;
memory_initialized = 1;
}
UNLOCK_COMMAND(&alloc_lock);
}
UNLOCK_COMMAND(&alloc_lock);
#if defined(USE_OPENMP)
}
#endif
#ifdef DEBUG
printf("Alloc Start ...\n");
#endif
#if defined(WHEREAMI) && !defined(USE_OPENMP)
mypos = WhereAmI();
position = mypos;
while (position >= NUM_BUFFERS) position >>= 1;
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#else
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
#endif
}
position ++;
} while (position < NUM_BUFFERS);
#endif
position = 0;
do {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#else
if (!memory[position].used) {
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
}
#endif
position ++;
} while (position < NUM_BUFFERS);
} while (position < BUFFERS_PER_THREAD);
goto error;
@ -1126,11 +1069,6 @@ void *blas_memory_alloc(int procpos){
#endif
memory[position].used = 1;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
#endif
if (!memory[position].addr) {
do {
@ -1148,14 +1086,14 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
}
#endif
#ifdef ALLOC_HUGETLBFILE
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
#endif
}
#endif
@ -1176,44 +1114,13 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1);
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
#endif
}
#if defined(WHEREAMI) && !defined(USE_OPENMP)
if (memory[position].pos == -1) memory[position].pos = mypos;
#endif
#ifdef DYNAMIC_ARCH
if (memory_initialized == 1) {
LOCK_COMMAND(&alloc_lock);
if (memory_initialized == 1) {
if (!gotoblas) gotoblas_dynamic_init();
memory_initialized = 2;
}
UNLOCK_COMMAND(&alloc_lock);
}
#endif
#ifdef DEBUG
printf("Mapped : %p %3d\n\n",
(void *)memory[position].addr, position);
@ -1222,7 +1129,7 @@ void *blas_memory_alloc(int procpos){
return (void *)memory[position].addr;
error:
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
return NULL;
}
@ -1236,10 +1143,7 @@ void blas_memory_free(void *free_area){
#endif
position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area))
position++;
if (memory[position].addr != free_area) goto error;
@ -1248,13 +1152,7 @@ void blas_memory_free(void *free_area){
printf(" Position : %d\n", position);
#endif
// arm: ensure all writes are finished before other thread takes this memory
WMB;
memory[position].used = 0;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf("Unmap Succeeded.\n\n");
@ -1266,11 +1164,8 @@ void blas_memory_free(void *free_area){
printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
#ifdef DEBUG
for (position = 0; position < NUM_BUFFERS; position++)
for (position = 0; position < BUFFERS_PER_THREAD; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
return;
}
@ -1293,8 +1188,6 @@ void blas_shutdown(void){
BLASFUNC(blas_thread_shutdown)();
#endif
LOCK_COMMAND(&alloc_lock);
for (pos = 0; pos < release_pos; pos ++) {
release_info[pos].func(&release_info[pos]);
}
@ -1305,17 +1198,11 @@ void blas_shutdown(void){
base_address = BASE_ADDRESS;
#endif
for (pos = 0; pos < NUM_BUFFERS; pos ++){
for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
memory[pos].addr = (void *)0;
memory[pos].used = 0;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
memory[pos].pos = -1;
#endif
memory[pos].lock = 0;
}
UNLOCK_COMMAND(&alloc_lock);
return;
}