Merge pull request #2469 from AGSaidi/acq-rel-2
Use acq/rel semantics to pass flags/pointers in getrf_parallel.
This commit is contained in:
commit
dbef479227
|
@ -68,23 +68,14 @@ double sqrt(double);
|
||||||
#define GETRF_FACTOR 1.00
|
#define GETRF_FACTOR 1.00
|
||||||
|
|
||||||
|
|
||||||
#if defined(USE_PTHREAD_LOCK)
|
#if (__STDC_VERSION__ >= 201112L)
|
||||||
static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER;
|
/* Relaxed atomic read of *p through the compiler's __atomic_load_n builtin.
 * The load carries no ordering of its own; call sites follow it with MB. */
#define atomic_load_long(p) __atomic_load_n((p), __ATOMIC_RELAXED)
|
||||||
#elif defined(USE_PTHREAD_SPINLOCK)
|
/* Relaxed atomic write of v into *p through the __atomic_store_n builtin.
 * The store carries no ordering of its own; call sites precede it with MB. */
#define atomic_store_long(p, v) __atomic_store_n((p), (v), __ATOMIC_RELAXED)
|
||||||
static pthread_spinlock_t getrf_lock = 0;
|
|
||||||
#else
|
#else
|
||||||
static BLASULONG getrf_lock = 0UL;
|
/* Fallback load for builds that fail the C11 version check: reads *p through
 * a volatile-qualified lvalue. The whole expansion is parenthesized so it
 * behaves as a single primary expression wherever the macro is used. */
#define atomic_load_long(p) ((BLASLONG)(*(volatile BLASLONG *)(p)))
|
||||||
|
/* Fallback store for builds that fail the C11 version check: a plain write
 * through a volatile-qualified lvalue. The assignment is wrapped in
 * parentheses so that operators following the macro invocation cannot be
 * absorbed into the stored value. */
#define atomic_store_long(p, v) (*(volatile BLASLONG *)(p) = (v))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(USE_PTHREAD_LOCK)
|
|
||||||
static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER;
|
|
||||||
#elif defined(USE_PTHREAD_SPINLOCK)
|
|
||||||
static pthread_spinlock_t getrf_flag_lock = 0;
|
|
||||||
#else
|
|
||||||
static BLASULONG getrf_flag_lock = 0UL;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
|
static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
|
||||||
|
@ -119,11 +110,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
|
||||||
FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
|
FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
|
||||||
FLOAT *sbb = sb;
|
FLOAT *sbb = sb;
|
||||||
|
|
||||||
#if __STDC_VERSION__ >= 201112L
|
|
||||||
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
|
|
||||||
#else
|
|
||||||
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
|
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
|
||||||
#endif
|
|
||||||
|
|
||||||
blasint *ipiv = (blasint *)args -> c;
|
blasint *ipiv = (blasint *)args -> c;
|
||||||
|
|
||||||
|
@ -180,7 +167,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) flag[mypos * CACHE_LINE_SIZE] = 0;
|
if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) {
|
||||||
|
MB;
|
||||||
|
atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0);
|
||||||
|
}
|
||||||
|
|
||||||
for (is = 0; is < m; is += GEMM_P){
|
for (is = 0; is < m; is += GEMM_P){
|
||||||
min_i = m - is;
|
min_i = m - is;
|
||||||
|
@ -201,14 +191,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
|
||||||
/* Non blocking implementation */
|
/* Non blocking implementation */
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
#if __STDC_VERSION__ >= 201112L
|
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
_Atomic
|
|
||||||
#else
|
|
||||||
volatile
|
|
||||||
#endif
|
|
||||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
|
||||||
} job_t;
|
} job_t;
|
||||||
|
|
||||||
|
|
||||||
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||||
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||||
|
|
||||||
|
@ -246,11 +232,8 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
|
|
||||||
blasint *ipiv = (blasint *)args -> c;
|
blasint *ipiv = (blasint *)args -> c;
|
||||||
BLASLONG jw;
|
BLASLONG jw;
|
||||||
#if __STDC_VERSION__ >= 201112L
|
|
||||||
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
|
|
||||||
#else
|
|
||||||
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
|
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
|
||||||
#endif
|
|
||||||
if (args -> a == NULL) {
|
if (args -> a == NULL) {
|
||||||
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
|
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
|
||||||
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||||
|
@ -280,10 +263,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
#if 1
|
#if 1
|
||||||
{
|
{
|
||||||
do {
|
do {
|
||||||
LOCK_COMMAND(&getrf_lock);
|
jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside]);
|
||||||
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
|
|
||||||
UNLOCK_COMMAND(&getrf_lock);
|
|
||||||
} while (jw);
|
} while (jw);
|
||||||
|
MB;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
|
||||||
|
@ -326,21 +308,17 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
}
|
}
|
||||||
MB;
|
MB;
|
||||||
for (i = 0; i < args -> nthreads; i++) {
|
for (i = 0; i < args -> nthreads; i++) {
|
||||||
LOCK_COMMAND(&getrf_lock);
|
atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
|
||||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
|
||||||
UNLOCK_COMMAND(&getrf_lock);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOCK_COMMAND(&getrf_flag_lock);
|
MB;
|
||||||
flag[mypos * CACHE_LINE_SIZE] = 0;
|
atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0);
|
||||||
UNLOCK_COMMAND(&getrf_flag_lock);
|
|
||||||
|
|
||||||
if (m == 0) {
|
if (m == 0) {
|
||||||
|
MB;
|
||||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||||
LOCK_COMMAND(&getrf_lock);
|
atomic_store_long(&job[mypos].working[mypos][CACHE_LINE_SIZE * xxx], 0);
|
||||||
job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
|
|
||||||
UNLOCK_COMMAND(&getrf_lock);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -366,10 +344,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
if ((current != mypos) && (!is)) {
|
if ((current != mypos) && (!is)) {
|
||||||
#if 1
|
#if 1
|
||||||
do {
|
do {
|
||||||
LOCK_COMMAND(&getrf_lock);
|
jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]);
|
||||||
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
|
} while (jw == 0);
|
||||||
UNLOCK_COMMAND(&getrf_lock);
|
MB;
|
||||||
} while (jw == 0);
|
|
||||||
#else
|
#else
|
||||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
|
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
|
||||||
#endif
|
#endif
|
||||||
|
@ -381,9 +358,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
|
|
||||||
MB;
|
MB;
|
||||||
if (is + min_i >= m) {
|
if (is + min_i >= m) {
|
||||||
LOCK_COMMAND(&getrf_lock);
|
atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], 0);
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
|
||||||
UNLOCK_COMMAND(&getrf_lock);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -397,10 +372,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||||
#if 1
|
#if 1
|
||||||
do {
|
do {
|
||||||
LOCK_COMMAND(&getrf_lock);
|
jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE *xxx]);
|
||||||
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
|
|
||||||
UNLOCK_COMMAND(&getrf_lock);
|
|
||||||
} while(jw != 0);
|
} while(jw != 0);
|
||||||
|
MB;
|
||||||
#else
|
#else
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
|
||||||
#endif
|
#endif
|
||||||
|
@ -443,12 +417,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
|
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
|
||||||
#else
|
#else
|
||||||
#if __STDC_VERSION__ >= 201112L
|
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
||||||
_Atomic
|
|
||||||
#else
|
|
||||||
volatile
|
|
||||||
#endif
|
|
||||||
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
|
@ -543,7 +512,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
if (width > mn - is - bk) width = mn - is - bk;
|
if (width > mn - is - bk) width = mn - is - bk;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]);
|
|
||||||
|
if (num_cpu > 0) {
|
||||||
|
WMB;
|
||||||
|
exec_blas_async_wait(num_cpu, &queue[0]);
|
||||||
|
}
|
||||||
|
|
||||||
mm = m - bk - is;
|
mm = m - bk - is;
|
||||||
nn = n - bk - is;
|
nn = n - bk - is;
|
||||||
|
@ -608,7 +581,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
queue[num_cpu].sa = NULL;
|
queue[num_cpu].sa = NULL;
|
||||||
queue[num_cpu].sb = NULL;
|
queue[num_cpu].sb = NULL;
|
||||||
queue[num_cpu].next = &queue[num_cpu + 1];
|
queue[num_cpu].next = &queue[num_cpu + 1];
|
||||||
flag[num_cpu * CACHE_LINE_SIZE] = 1;
|
atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1);
|
||||||
|
|
||||||
num_cpu ++;
|
num_cpu ++;
|
||||||
|
|
||||||
|
@ -637,6 +610,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
if (num_cpu > 0) {
|
if (num_cpu > 0) {
|
||||||
queue[num_cpu - 1].next = NULL;
|
queue[num_cpu - 1].next = NULL;
|
||||||
|
|
||||||
|
WMB;
|
||||||
|
|
||||||
exec_blas_async(0, &queue[0]);
|
exec_blas_async(0, &queue[0]);
|
||||||
|
|
||||||
inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);
|
inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);
|
||||||
|
@ -647,14 +622,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
|
|
||||||
for (i = 0; i < num_cpu; i ++) {
|
for (i = 0; i < num_cpu; i ++) {
|
||||||
#if 1
|
#if 1
|
||||||
LOCK_COMMAND(&getrf_flag_lock);
|
do {
|
||||||
f=flag[i*CACHE_LINE_SIZE];
|
f = atomic_load_long(&flag[i*CACHE_LINE_SIZE]);
|
||||||
UNLOCK_COMMAND(&getrf_flag_lock);
|
} while (f != 0);
|
||||||
while (f!=0) {
|
MB;
|
||||||
LOCK_COMMAND(&getrf_flag_lock);
|
|
||||||
f=flag[i*CACHE_LINE_SIZE];
|
|
||||||
UNLOCK_COMMAND(&getrf_flag_lock);
|
|
||||||
};
|
|
||||||
#else
|
#else
|
||||||
while (flag[i*CACHE_LINE_SIZE]) {};
|
while (flag[i*CACHE_LINE_SIZE]) {};
|
||||||
#endif
|
#endif
|
||||||
|
@ -719,12 +690,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
BLASLONG range[MAX_CPU_NUMBER + 1];
|
BLASLONG range[MAX_CPU_NUMBER + 1];
|
||||||
|
|
||||||
BLASLONG width, nn, num_cpu;
|
BLASLONG width, nn, num_cpu;
|
||||||
#if __STDC_VERSION__ >= 201112L
|
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
||||||
_Atomic
|
|
||||||
#else
|
|
||||||
volatile
|
|
||||||
#endif
|
|
||||||
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
|
@ -833,6 +799,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
nn = n - bk - is;
|
nn = n - bk - is;
|
||||||
if (width > nn) width = nn;
|
if (width > nn) width = nn;
|
||||||
|
|
||||||
|
WMB;
|
||||||
|
|
||||||
if (num_cpu > 1) exec_blas_async_wait(num_cpu - 1, &queue[1]);
|
if (num_cpu > 1) exec_blas_async_wait(num_cpu - 1, &queue[1]);
|
||||||
|
|
||||||
range[0] = 0;
|
range[0] = 0;
|
||||||
|
@ -867,7 +835,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
queue[num_cpu].sa = NULL;
|
queue[num_cpu].sa = NULL;
|
||||||
queue[num_cpu].sb = NULL;
|
queue[num_cpu].sb = NULL;
|
||||||
queue[num_cpu].next = &queue[num_cpu + 1];
|
queue[num_cpu].next = &queue[num_cpu + 1];
|
||||||
flag[num_cpu * CACHE_LINE_SIZE] = 1;
|
atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1);
|
||||||
|
|
||||||
num_cpu ++;
|
num_cpu ++;
|
||||||
}
|
}
|
||||||
|
@ -882,6 +850,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
range_n_new[0] = offset + is;
|
range_n_new[0] = offset + is;
|
||||||
range_n_new[1] = offset + is + bk;
|
range_n_new[1] = offset + is + bk;
|
||||||
|
|
||||||
|
WMB;
|
||||||
if (num_cpu > 1) {
|
if (num_cpu > 1) {
|
||||||
|
|
||||||
exec_blas_async(1, &queue[1]);
|
exec_blas_async(1, &queue[1]);
|
||||||
|
@ -917,7 +886,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
|
for (i = 1; i < num_cpu; i ++) while (atomic_load_long(&flag[i * CACHE_LINE_SIZE])) {};
|
||||||
|
|
||||||
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);
|
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue