use atomic operations as in the corresponding getrf
This commit is contained in:
parent
9af2a9dc3b
commit
2dda40d280
|
@ -105,6 +105,14 @@ typedef struct {
|
|||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
#ifdef HAVE_C11
|
||||
/* Load the value at p with relaxed ordering using the compiler's __atomic
   builtin. No ordering is implied by the load itself; NOTE(review): callers
   appear to pair it with an explicit MB -- confirm at each use site. */
#define atomic_load_long(p) (__atomic_load_n((p), __ATOMIC_RELAXED))
|
||||
/* Store v to the location p with relaxed ordering using the compiler's
   __atomic builtin. The expansion has type void, so it can only be used as a
   statement. */
#define atomic_store_long(p, v) (__atomic_store_n((p), (v), __ATOMIC_RELAXED))
|
||||
#else
|
||||
/* Fallback load when the __atomic builtins are not selected: read *p through a
   volatile-qualified BLASLONG lvalue so the compiler re-reads memory on every
   evaluation. The whole expansion is parenthesized so it behaves as a single
   primary expression (e.g. under sizeof or next to postfix operators).
   NOTE(review): volatile alone gives no inter-thread ordering; this presumably
   relies on the MB/WMB barriers issued by callers -- confirm. */
#define atomic_load_long(p) ((BLASLONG)(*(volatile BLASLONG *)(p)))
|
||||
/* Fallback store when the __atomic builtins are not selected: write v to *p
   through a volatile-qualified BLASLONG lvalue. The assignment is wrapped in
   parentheses so the macro expands to one expression and stays correct inside
   ?: or other contexts that bind tighter than '='.
   NOTE(review): volatile alone gives no inter-thread ordering; this presumably
   relies on the MB/WMB barriers issued by callers -- confirm. */
#define atomic_store_long(p, v) (*(volatile BLASLONG *)(p) = (v))
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef KERNEL_OPERATION
|
||||
#ifndef COMPLEX
|
||||
|
@ -233,14 +241,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
}
|
||||
|
||||
#ifndef LOWER
|
||||
MB;
|
||||
for (i = 0; i <= mypos; i++)
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
|
||||
// job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
#else
|
||||
MB
|
||||
for (i = mypos; i < args -> nthreads; i++)
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
|
||||
// job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
#endif
|
||||
|
||||
WMB;
|
||||
// WMB;
|
||||
}
|
||||
|
||||
min_i = m_to - m_from;
|
||||
|
@ -271,14 +283,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
||||
|
||||
/* thread has to wait */
|
||||
if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
if (current != mypos)
|
||||
do {
|
||||
jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]);
|
||||
} while (jw == 0);
|
||||
MB;
|
||||
|
||||
//while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
|
||||
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
|
||||
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
|
||||
c, lda, m_from, xxx);
|
||||
|
||||
if (m_from + min_i >= m_to) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
|
||||
// job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
@ -323,7 +342,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
c, lda, is, xxx);
|
||||
|
||||
if (is + min_i >= m_to) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
|
||||
// job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
@ -337,9 +357,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
if (i != mypos) {
|
||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++)
|
||||
#if 1
|
||||
{
|
||||
do {
|
||||
jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * xxx]);
|
||||
} while (jw);
|
||||
MB;
|
||||
}
|
||||
#else
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
|
||||
}
|
||||
#endif
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue