diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index b3b1ce7bd..1fd848c6b 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -425,6 +425,10 @@ static int blas_thread_server(void *arg){ main_status[cpu] = MAIN_FINISH; #endif + // arm: make sure all results are written out _before_ + // thread is marked as done and other threads use them + WMB; + thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ WMB; @@ -775,7 +779,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ stop = rpcc(); #endif - if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + if ((num > 1) && queue -> next) { + exec_blas_async_wait(num - 1, queue -> next); + + // arm: make sure results from other threads are visible + MB; + } #ifdef TIMING_DEBUG fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n", diff --git a/driver/others/memory.c b/driver/others/memory.c index a562da377..49c57f911 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1153,6 +1153,9 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif + // arm: ensure all writes are finished before other thread takes this memory + WMB; + memory[position].used = 0; #ifdef DEBUG