Refs #174. Fixed the buffer overflow bug in the multithreaded hbmv and sbmv routines.
Instead of sharing thread 0's buffer, each thread now uses its own sb buffer, which avoids overflowing thread 0's buffer.
This commit is contained in:
parent
5c8bf6ae0e
commit
5155e3f509
|
@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||||
|
|
||||||
a = (FLOAT *)args -> a;
|
a = (FLOAT *)args -> a;
|
||||||
x = (FLOAT *)args -> b;
|
x = (FLOAT *)args -> b;
|
||||||
y = (FLOAT *)args -> c;
|
|
||||||
|
|
||||||
lda = args -> lda;
|
lda = args -> lda;
|
||||||
incx = args -> ldb;
|
incx = args -> ldb;
|
||||||
|
@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||||
n_from = 0;
|
n_from = 0;
|
||||||
n_to = n;
|
n_to = n;
|
||||||
|
|
||||||
|
// Use the first n * COMPSIZE elements of this thread's sb buffer as y
|
||||||
|
y = buffer;
|
||||||
|
buffer += ((COMPSIZE * n + 1023) & ~1023);
|
||||||
|
|
||||||
if (range_m) {
|
if (range_m) {
|
||||||
n_from = *(range_m + 0);
|
n_from = *(range_m + 0);
|
||||||
n_to = *(range_m + 1);
|
n_to = *(range_m + 1);
|
||||||
|
@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||||
a += n_from * lda * COMPSIZE;
|
a += n_from * lda * COMPSIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (range_n) y += *range_n * COMPSIZE;
|
|
||||||
|
|
||||||
if (incx != 1) {
|
if (incx != 1) {
|
||||||
COPY_K(n, x, incx, buffer, 1);
|
COPY_K(n, x, incx, buffer, 1);
|
||||||
|
@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||||
|
|
||||||
if (num_cpu) {
|
if (num_cpu) {
|
||||||
queue[0].sa = NULL;
|
queue[0].sa = NULL;
|
||||||
queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
|
queue[0].sb = buffer;
|
||||||
queue[num_cpu - 1].next = NULL;
|
queue[num_cpu - 1].next = NULL;
|
||||||
|
|
||||||
exec_blas(num_cpu, queue);
|
exec_blas(num_cpu, queue);
|
||||||
|
@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||||
#else
|
#else
|
||||||
ONE, ZERO,
|
ONE, ZERO,
|
||||||
#endif
|
#endif
|
||||||
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
|
(FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
AXPYU_K(n, 0, 0,
|
AXPYU_K(n, 0, 0,
|
||||||
|
|
|
@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){
|
||||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
queue->sb=sb;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef MONITOR
|
#ifdef MONITOR
|
||||||
|
|
Loading…
Reference in New Issue