Fix barriers in level3_thread

This commit is contained in:
Ali Saidi 2020-02-29 17:27:18 +00:00
parent 430ee31e66
commit 97ce6bbce2
1 changed file with 10 additions and 7 deletions

View File

@@ -351,8 +351,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using workspace */ /* Make sure if no one is using workspace */
START_RPCC(); START_RPCC();
for (i = 0; i < args -> nthreads; i++) for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
STOP_RPCC(waiting1); STOP_RPCC(waiting1);
MB;
#if defined(FUSED_GEMM) && !defined(TIMING) #if defined(FUSED_GEMM) && !defined(TIMING)
@@ -395,10 +396,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
} }
#endif #endif
WMB;
/* Set flag so other threads can access local region of B */ /* Set flag so other threads can access local region of B */
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
WMB;
} }
/* Get regions of B from other threads and apply kernel */ /* Get regions of B from other threads and apply kernel */
@@ -417,8 +418,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Wait until other region of B is initialized */ /* Wait until other region of B is initialized */
START_RPCC(); START_RPCC();
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
STOP_RPCC(waiting2); STOP_RPCC(waiting2);
MB;
/* Apply kernel with local region of A and part of other region of B */ /* Apply kernel with local region of A and part of other region of B */
START_RPCC(); START_RPCC();
@@ -434,8 +436,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with other region of B */ /* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) { if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB; WMB;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
} }
} }
} while (current != mypos); } while (current != mypos);
@@ -477,8 +479,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with region of B */ /* Clear synchronization flag if this thread is done with region of B */
if (is + min_i >= m_to) { if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB; WMB;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
} }
} }
@@ -497,10 +499,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC(); START_RPCC();
for (i = 0; i < args -> nthreads; i++) { for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) { for (js = 0; js < DIVIDE_RATE; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
} }
} }
STOP_RPCC(waiting3); STOP_RPCC(waiting3);
MB;
#ifdef TIMING #ifdef TIMING
BLASLONG waiting = waiting1 + waiting2 + waiting3; BLASLONG waiting = waiting1 + waiting2 + waiting3;
@@ -705,7 +708,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
} }
} }
} }
WMB;
/* Execute parallel computation */ /* Execute parallel computation */
exec_blas(nthreads, queue); exec_blas(nthreads, queue);
} }