diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 7e871dec1..9debe178d 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -276,6 +276,9 @@ static void* blas_thread_server(void *arg){ unsigned int last_tick; void *buffer, *sa, *sb; blas_queue_t *queue; + +blas_queue_t *tscq; + #ifdef TIMING_DEBUG unsigned long start, stop; #endif @@ -309,8 +312,11 @@ static void* blas_thread_server(void *arg){ last_tick = (unsigned int)rpcc(); - while (!thread_status[cpu].queue) { + pthread_mutex_lock (&thread_status[cpu].lock); + tscq=thread_status[cpu].queue; + pthread_mutex_unlock (&thread_status[cpu].lock); + while(!tscq) { YIELDING; if ((unsigned int)rpcc() - last_tick > thread_timeout) { @@ -333,6 +339,9 @@ static void* blas_thread_server(void *arg){ last_tick = (unsigned int)rpcc(); } + pthread_mutex_lock (&thread_status[cpu].lock); + tscq=thread_status[cpu].queue; + pthread_mutex_unlock (&thread_status[cpu].lock); } @@ -351,7 +360,9 @@ static void* blas_thread_server(void *arg){ if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + pthread_mutex_lock (&thread_status[cpu].lock); thread_status[cpu].queue = (blas_queue_t *)1; + pthread_mutex_unlock (&thread_status[cpu].lock); sa = queue -> sa; sb = queue -> sb; @@ -433,7 +444,10 @@ static void* blas_thread_server(void *arg){ // thread is marked as done and other threads use them WMB; + pthread_mutex_lock (&thread_status[cpu].lock); thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ + pthread_mutex_unlock (&thread_status[cpu].lock); + WMB; } @@ -613,6 +627,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ #endif BLASLONG i = 0; blas_queue_t *current = queue; + blas_queue_t *tsiq,*tspq; #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) int node = get_node(); int nodes = get_num_nodes(); @@ -660,15 +675,23 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ } } #else - while(thread_status[i].queue) { + pthread_mutex_lock (&thread_status[i].lock); + tsiq=thread_status[i].queue ; + pthread_mutex_unlock (&thread_status[i].lock); + while(tsiq) { i ++; if (i >= blas_num_threads - 1) i = 0; + pthread_mutex_lock (&thread_status[i].lock); + tsiq=thread_status[i].queue ; + pthread_mutex_unlock (&thread_status[i].lock); } #endif queue -> assigned = i; WMB; + pthread_mutex_lock (&thread_status[i].lock); thread_status[i].queue = queue; + pthread_mutex_unlock (&thread_status[i].lock); WMB; queue = queue -> next; @@ -689,11 +712,15 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ pos = current -> assigned; - if ((BLASULONG)thread_status[pos].queue > 1) { + pthread_mutex_lock (&thread_status[pos].lock); + tspq=thread_status[pos].queue; + pthread_mutex_unlock (&thread_status[pos].lock); + + if ((BLASULONG)tspq > 1) { + pthread_mutex_lock (&thread_status[pos].lock); if (thread_status[pos].status == THREAD_STATUS_SLEEP) { - pthread_mutex_lock (&thread_status[pos].lock); #ifdef MONITOR num_suspend ++; @@ -703,8 +730,9 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ thread_status[pos].status = THREAD_STATUS_WAKEUP; pthread_cond_signal(&thread_status[pos].wakeup); } - pthread_mutex_unlock(&thread_status[pos].lock); + } + pthread_mutex_unlock(&thread_status[pos].lock); } current = current -> next; @@ -714,11 +742,22 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ } int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ + blas_queue_t * tsqq; while ((num > 0) && queue) { - while(thread_status[queue -> assigned].queue) { + pthread_mutex_lock(&thread_status[queue->assigned].lock); + tsqq=thread_status[queue -> assigned].queue; + pthread_mutex_unlock(&thread_status[queue->assigned].lock); + + + while(tsqq) { YIELDING; + pthread_mutex_lock(&thread_status[queue->assigned].lock); + tsqq=thread_status[queue -> assigned].queue; + pthread_mutex_unlock(&thread_status[queue->assigned].lock); + + }; queue = queue -> next; diff --git a/driver/others/memory.c b/driver/others/memory.c index 5fd9da5c2..5e4c4c111 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -390,6 +390,16 @@ static void alloc_mmap_free(struct release_t *release){ } } +/* Global lock for memory allocation */ + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t alloc_lock = 0; +#else +static BLASULONG alloc_lock = 0UL; +#endif + #ifdef NO_WARMUP static void *alloc_mmap(void *address){ @@ -406,9 +416,11 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { + LOCK_COMMAND(&alloc_lock); release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; + UNLOCK_COMMAND(&alloc_lock); } #ifdef OS_LINUX @@ -550,12 +562,14 @@ static void *alloc_mmap(void *address){ #if defined(OS_LINUX) && !defined(NO_WARMUP) } #endif + LOCK_COMMAND(&alloc_lock); if (map_address != (void *)-1) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; } + UNLOCK_COMMAND(&alloc_lock); return map_address; } @@ -889,15 +903,6 @@ static void *alloc_hugetlbfile(void *address){ } #endif -/* Global lock for memory allocation */ - -#if defined(USE_PTHREAD_LOCK) -static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; -#elif defined(USE_PTHREAD_SPINLOCK) -static pthread_spinlock_t alloc_lock = 0; -#else -static BLASULONG alloc_lock = 0UL; -#endif #ifdef SEEK_ADDRESS static BLASULONG base_address = 0UL; @@ -963,45 +968,41 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); + LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { - LOCK_COMMAND(&alloc_lock); - - if (!memory_initialized) { - #if defined(WHEREAMI) && !defined(USE_OPENMP) - for (position = 0; position < NUM_BUFFERS; position ++){ - memory[position].addr = (void *)0; - memory[position].pos = -1; - memory[position].used = 0; - memory[position].lock = 0; - } + for (position = 0; position < NUM_BUFFERS; position ++){ + memory[position].addr = (void *)0; + memory[position].pos = -1; + memory[position].used = 0; + memory[position].lock = 0; + } #endif #ifdef DYNAMIC_ARCH - gotoblas_dynamic_init(); + gotoblas_dynamic_init(); #endif #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) - gotoblas_affinity_init(); + gotoblas_affinity_init(); #endif #ifdef SMP - if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); + if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) #ifndef DYNAMIC_ARCH - blas_set_parameter(); + blas_set_parameter(); #endif #endif - memory_initialized = 1; - } + memory_initialized = 1; - UNLOCK_COMMAND(&alloc_lock); } + UNLOCK_COMMAND(&alloc_lock); #ifdef DEBUG printf("Alloc Start ...\n"); @@ -1034,14 +1035,14 @@ void *blas_memory_alloc(int procpos){ position = 0; do { - if (!memory[position].used) { +/* if (!memory[position].used) { */ blas_lock(&memory[position].lock); if (!memory[position].used) goto allocation; blas_unlock(&memory[position].lock); - } +/* } */ position ++; @@ -1103,7 +1104,9 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); + LOCK_COMMAND(&alloc_lock); memory[position].addr = map_address; + UNLOCK_COMMAND(&alloc_lock); #ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); @@ -1157,6 +1160,7 @@ void blas_memory_free(void *free_area){ #endif position = 0; + LOCK_COMMAND(&alloc_lock); while ((memory[position].addr != free_area) && (position < NUM_BUFFERS)) position++; @@ -1171,6 +1175,7 @@ void blas_memory_free(void *free_area){ WMB; memory[position].used = 0; + UNLOCK_COMMAND(&alloc_lock); #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -1185,6 +1190,7 @@ void blas_memory_free(void *free_area){ for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif + UNLOCK_COMMAND(&alloc_lock); return; }