diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c
index a7c28f4c2..c38a2632d 100644
--- a/lapack/potrf/potrf_parallel.c
+++ b/lapack/potrf/potrf_parallel.c
@@ -105,6 +105,14 @@ typedef struct {
   BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
 } job_t;
 
+#ifdef HAVE_C11
+#define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED)
+#define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
+#else
+#define atomic_load_long(p) (BLASLONG)(*(volatile BLASLONG*)(p))
+#define atomic_store_long(p, v) (*(volatile BLASLONG *)(p)) = (v)
+#endif
+
 
 #ifndef KERNEL_OPERATION
 #ifndef COMPLEX
@@ -233,14 +241,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       }
 
 #ifndef LOWER
+      MB;
       for (i = 0; i <= mypos; i++)
-        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
+        atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
+        // job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
 #else
+      MB
       for (i = mypos; i < args -> nthreads; i++)
-        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
+        atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
+//      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
 #endif
 
-      WMB;
+//      WMB;
     }
 
     min_i = m_to - m_from;
@@ -271,14 +283,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
 
         /* thread has to wait */
-        if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
+        if (current != mypos)
+          do {
+            jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]);
+          } while (jw == 0);
+        MB;
+
+        //while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
 
         KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
                          sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                          c, lda, m_from, xxx);
 
         if (m_from + min_i >= m_to) {
-          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+          atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
+//        job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
           WMB;
         }
       }
@@ -323,7 +342,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
                              c, lda, is, xxx);
 
             if (is + min_i >= m_to) {
-              job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+              atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
+//            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
               WMB;
             }
           }
@@ -337,9 +357,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
   for (i = 0; i < args -> nthreads; i++) {
     if (i != mypos) {
-      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
+      for (xxx = 0; xxx < DIVIDE_RATE; xxx++)
+#if 1
+      {
+        do {
+          jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * xxx]);
+        } while (jw);
+        MB;
+      }
+#else
         while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
-      }
+#endif
+      //  }
     }
   }
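
Note (not part of the patch): below is a minimal standalone sketch of the hand-off pattern the diff introduces, for readers unfamiliar with it. A producer fills a buffer, issues a write barrier, then publishes a flag with a relaxed atomic store; a waiter spins on relaxed atomic loads and issues a read barrier before touching the buffer. The `atomic_load_long`/`atomic_store_long` fallbacks mirror the `#else` branch of the patch, while `MB`/`WMB` here stand in for OpenBLAS's architecture-specific barrier macros and are approximated with `__atomic_thread_fence`; the `flag`, `payload`, `producer`, and `consumer` names are illustrative only.

```c
/* Sketch of the relaxed load/store + barrier hand-off used in the patch.
 * Assumes GCC/Clang __atomic builtins and POSIX threads; not OpenBLAS code. */
#include <pthread.h>
#include <stdio.h>

#if defined(__GNUC__)
#define atomic_load_long(p)     __atomic_load_n(p, __ATOMIC_RELAXED)
#define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
#define MB  __atomic_thread_fence(__ATOMIC_ACQUIRE)   /* read barrier  */
#define WMB __atomic_thread_fence(__ATOMIC_RELEASE)   /* write barrier */
#else
/* Fallback mirrors the #else branch of the patch: plain volatile accesses. */
#define atomic_load_long(p)     (*(volatile long *)(p))
#define atomic_store_long(p, v) (*(volatile long *)(p)) = (v)
#define MB
#define WMB
#endif

static long   flag;     /* 0 = buffer not ready, nonzero = ready */
static double payload;  /* data published by the producer        */

static void *producer(void *arg) {
  (void)arg;
  payload = 42.0;               /* fill the buffer ...                 */
  WMB;                          /* ... make it visible before the flag */
  atomic_store_long(&flag, 1);  /* relaxed store of the ready flag     */
  return NULL;
}

static void *consumer(void *arg) {
  (void)arg;
  long jw;
  do {                          /* spin until the flag becomes set     */
    jw = atomic_load_long(&flag);
  } while (jw == 0);
  MB;                           /* order the flag read before payload  */
  printf("payload = %g\n", payload);
  atomic_store_long(&flag, 0);  /* hand the slot back                  */
  return NULL;
}

int main(void) {
  pthread_t p, c;
  pthread_create(&c, NULL, consumer, NULL);
  pthread_create(&p, NULL, producer, NULL);
  pthread_join(p, NULL);
  pthread_join(c, NULL);
  return 0;
}
```

Build with `gcc -O2 -pthread`. The point the sketch illustrates is that the relaxed atomic accesses only guarantee the flag itself is re-read on every iteration; the separate `MB`/`WMB` barriers are still what orders the payload accesses around the flag, which is why the patch keeps them next to the new `atomic_load_long`/`atomic_store_long` calls.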