diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index 6f4e20610..ce028a7fc 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -140,6 +140,16 @@ typedef struct {
 
 } thread_status_t;
 
+#if (__STDC_VERSION__ >= 201112L)
+#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED)
+#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
+#else
+#define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p))
+#define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v))
+#endif
+
+
+
 static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE)));
 
 #ifndef THREAD_TIMEOUT
@@ -312,20 +322,19 @@ blas_queue_t *tscq;
 
     last_tick = (unsigned int)rpcc();
 
-    pthread_mutex_lock (&thread_status[cpu].lock);
-    tscq=thread_status[cpu].queue;
-    pthread_mutex_unlock (&thread_status[cpu].lock);
+    tscq = atomic_load_queue(&thread_status[cpu].queue);
 
     while(!tscq) {
      YIELDING;
 
      if ((unsigned int)rpcc() - last_tick > thread_timeout) {
 
-       pthread_mutex_lock (&thread_status[cpu].lock);
-       if (!thread_status[cpu].queue) {
+       if (!atomic_load_queue(&thread_status[cpu].queue)) {
+         pthread_mutex_lock (&thread_status[cpu].lock);
          thread_status[cpu].status = THREAD_STATUS_SLEEP;
-         while (thread_status[cpu].status == THREAD_STATUS_SLEEP) {
+         while (thread_status[cpu].status == THREAD_STATUS_SLEEP &&
+                !atomic_load_queue(&thread_status[cpu].queue)) {
 
 #ifdef MONITOR
            main_status[cpu] = MAIN_SLEEPING;
@@ -333,19 +342,18 @@ blas_queue_t *tscq;
            pthread_cond_wait(&thread_status[cpu].wakeup,
                              &thread_status[cpu].lock);
          }
+         pthread_mutex_unlock(&thread_status[cpu].lock);
        }
 
-       pthread_mutex_unlock(&thread_status[cpu].lock);
-
        last_tick = (unsigned int)rpcc();
      }
 
-     pthread_mutex_lock (&thread_status[cpu].lock);
-     tscq=thread_status[cpu].queue;
-     pthread_mutex_unlock (&thread_status[cpu].lock);
+
+     tscq = atomic_load_queue(&thread_status[cpu].queue);
 
    }
 
-   queue = thread_status[cpu].queue;
+   queue = atomic_load_queue(&thread_status[cpu].queue);
+   MB;
 
    if ((long)queue == -1) break;
@@ -360,9 +368,7 @@ blas_queue_t *tscq;
    if (queue) {
      int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
 
-     pthread_mutex_lock (&thread_status[cpu].lock);
-     thread_status[cpu].queue = (blas_queue_t *)1;
-     pthread_mutex_unlock (&thread_status[cpu].lock);
+     atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1);
 
      sa = queue -> sa;
      sb = queue -> sb;
@@ -442,13 +448,9 @@ blas_queue_t *tscq;
 
      // arm: make sure all results are written out _before_
      // thread is marked as done and other threads use them
-     WMB;
+     MB;
+     atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0);
 
-     pthread_mutex_lock (&thread_status[cpu].lock);
-     thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */
-     pthread_mutex_unlock (&thread_status[cpu].lock);
-
-     WMB;
    }
 
@@ -566,12 +568,9 @@ int blas_thread_init(void){
 
    for(i = 0; i < blas_num_threads - 1; i++){
 
-     thread_status[i].queue = (blas_queue_t *)NULL;
+     atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
      thread_status[i].status = THREAD_STATUS_WAKEUP;
 
-     pthread_mutex_init(&thread_status[i].lock, NULL);
-     pthread_cond_init (&thread_status[i].wakeup, NULL);
-
 #ifdef NEED_STACKATTR
      ret=pthread_create(&blas_threads[i], &attr,
                         &blas_thread_server, (void *)i);
@@ -655,7 +654,8 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
    if (queue -> mode & BLAS_NODE) {
 
      do {
-       while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++;
+
+       while((thread_status[i].node != node || atomic_load_queue(&thread_status[i].queue)) && (i < blas_num_threads - 1)) i ++;
 
        if (i < blas_num_threads - 1) break;
 
@@ -669,36 +669,26 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
      } while (1);
 
    } else {
-     pthread_mutex_lock (&thread_status[i].lock);
-     tsiq = thread_status[i].queue;
-     pthread_mutex_unlock (&thread_status[i].lock);
+     tsiq = atomic_load_queue(&thread_status[i].queue);
      while(tsiq) {
        i ++;
        if (i >= blas_num_threads - 1) i = 0;
-       pthread_mutex_lock (&thread_status[i].lock);
-       tsiq = thread_status[i].queue;
-       pthread_mutex_unlock (&thread_status[i].lock);
+       tsiq = atomic_load_queue(&thread_status[i].queue);
      }
    }
 #else
-   pthread_mutex_lock (&thread_status[i].lock);
-   tsiq=thread_status[i].queue ;
-   pthread_mutex_unlock (&thread_status[i].lock);
+   tsiq = atomic_load_queue(&thread_status[i].queue);
    while(tsiq) {
      i ++;
      if (i >= blas_num_threads - 1) i = 0;
-     pthread_mutex_lock (&thread_status[i].lock);
-     tsiq=thread_status[i].queue ;
-     pthread_mutex_unlock (&thread_status[i].lock);
+     tsiq = atomic_load_queue(&thread_status[i].queue);
    }
 #endif
 
    queue -> assigned = i;
-   WMB;
-   pthread_mutex_lock (&thread_status[i].lock);
-   thread_status[i].queue = queue;
-   pthread_mutex_unlock (&thread_status[i].lock);
-   WMB;
+   MB;
+
+   atomic_store_queue(&thread_status[i].queue, queue);
 
    queue = queue -> next;
    pos ++;
@@ -718,9 +708,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 
      pos = current -> assigned;
 
-     pthread_mutex_lock (&thread_status[pos].lock);
-     tspq=thread_status[pos].queue;
-     pthread_mutex_unlock (&thread_status[pos].lock);
+     tspq = atomic_load_queue(&thread_status[pos].queue);
 
      if ((BLASULONG)tspq > 1) {
        pthread_mutex_lock (&thread_status[pos].lock);
@@ -752,24 +740,20 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
 
   while ((num > 0) && queue) {
 
-    pthread_mutex_lock(&thread_status[queue->assigned].lock);
-    tsqq=thread_status[queue -> assigned].queue;
-    pthread_mutex_unlock(&thread_status[queue->assigned].lock);
+    tsqq = atomic_load_queue(&thread_status[queue->assigned].queue);
 
    while(tsqq) {
      YIELDING;
 
-     pthread_mutex_lock(&thread_status[queue->assigned].lock);
-     tsqq=thread_status[queue -> assigned].queue;
-     pthread_mutex_unlock(&thread_status[queue->assigned].lock);
-
-
+     tsqq = atomic_load_queue(&thread_status[queue->assigned].queue);
    };
 
    queue = queue -> next;
    num --;
  }
 
+ MB;
+
 #ifdef SMP_DEBUG
  fprintf(STDERR, "Done.\n\n");
 #endif
@@ -880,7 +864,7 @@ void goto_set_num_threads(int num_threads) {
 
    for(i = blas_num_threads - 1; i < num_threads - 1; i++){
 
-     thread_status[i].queue = (blas_queue_t *)NULL;
+     atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
      thread_status[i].status = THREAD_STATUS_WAKEUP;
 
      pthread_mutex_init(&thread_status[i].lock, NULL);
@@ -971,12 +955,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
 
    for (i = 0; i < blas_num_threads - 1; i++) {
 
+     pthread_mutex_lock (&thread_status[i].lock);
-     thread_status[i].queue = (blas_queue_t *)-1;
-
+     atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
      thread_status[i].status = THREAD_STATUS_WAKEUP;
-
      pthread_cond_signal (&thread_status[i].wakeup);
      pthread_mutex_unlock(&thread_status[i].lock);