Refs #221 #246. Fixed the overflowing stack bug in mutlithreading BLAS3.

When NUM_THREADS(MAX_CPU_NUNBERS) is very large ,e.g. 256.

typedef struct {
  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

job_t          job[MAX_CPU_NUMBER];

The job array is equal 8MB.

Thus, We use malloc instead of stack allocation.
This commit is contained in:
Zhang Xianyi 2013-07-08 01:07:05 +08:00
parent 886cbaf4e4
commit 5d3312142a
6 changed files with 122 additions and 9 deletions

View File

@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > 210
#define USE_ALLOC_HEAP
#endif
#ifndef GEMM3M_LOCAL
#if defined(NN)
#define GEMM3M_LOCAL GEMM3M_NN
@ -836,7 +842,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
BLASLONG range_M[MAX_CPU_NUMBER + 1];
BLASLONG range_N[MAX_CPU_NUMBER + 1];
job_t job[MAX_CPU_NUMBER];
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif
BLASLONG num_cpu_m, num_cpu_n;
@ -866,6 +876,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads;
#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif
newarg.common = (void *)job;
if (!range_m) {
@ -945,6 +964,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
exec_blas(num_cpu_m, queue);
}
#ifdef USE_ALLOC_HEAP
free(job);
#endif
return 0;
}

View File

@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > 210
#define USE_ALLOC_HEAP
#endif
#ifndef SYRK_LOCAL
#if !defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_UN
@ -502,7 +508,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
blas_arg_t newarg;
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range[MAX_CPU_NUMBER + 100];
@ -556,6 +567,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
newarg.ldc = args -> ldc;
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif
newarg.common = (void *)job;
if (!range_n) {
@ -668,6 +688,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
exec_blas(num_cpu, queue);
}
#ifdef USE_ALLOC_HEAP
free(job);
#endif
return 0;
}

View File

@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > 210
#define USE_ALLOC_HEAP
#endif
#ifndef GEMM_LOCAL
#if defined(NN)
#define GEMM_LOCAL GEMM_NN
@ -531,7 +537,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
blas_arg_t newarg;
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_M[MAX_CPU_NUMBER + 1];
@ -575,6 +586,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads;
#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif
newarg.common = (void *)job;
#ifdef PARAMTEST
@ -660,6 +680,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
exec_blas(num_cpu_m, queue);
}
#ifdef USE_ALLOC_HEAP
free(job);
#endif
return 0;
}

View File

@ -217,7 +217,7 @@ int get_num_procs(void) {
}
return nums;
}
/*
void set_stack_limit(int limitMB){
int result=0;
struct rlimit rl;
@ -235,6 +235,7 @@ void set_stack_limit(int limitMB){
}
}
}
*/
#endif
/*
@ -1273,12 +1274,6 @@ void CONSTRUCTOR gotoblas_init(void) {
#endif
#ifdef DYNAMIC_ARCH
#if defined(SMP) && defined(OS_DARWIN) && MAX_CPU_NUMBER > 128
//Set stack limit to 16MB on Mac OS X
//when NUM_THREADS>128 and DYNAMIC_ARCH=1.
//Prevent the SEGFAULT bug.
set_stack_limit(16);
#endif
gotoblas_dynamic_init();
#endif

View File

@ -43,6 +43,12 @@ static FLOAT dm1 = -1.;
double sqrt(double);
//In this case, the recursive getrf_parallel may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > 90
#define USE_ALLOC_HEAP
#endif
#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif
@ -356,7 +362,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
BLASLONG range_M[MAX_CPU_NUMBER + 1];
BLASLONG range_N[MAX_CPU_NUMBER + 1];
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job=NULL;
#endif
BLASLONG width, nn, mm;
BLASLONG i, j, k, is, bk;
@ -401,7 +411,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
newarg.c = ipiv;
newarg.lda = lda;
newarg.common = (void *)job;
info = 0;
@ -427,6 +436,16 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
if (iinfo && !info) info = iinfo;
#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif
newarg.common = (void *)job;
TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);
sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
@ -586,6 +605,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
is += bk;
}
#ifdef USE_ALLOC_HEAP
free(job);
#endif
return info;
}

View File

@ -41,6 +41,13 @@
#ifndef USE_SIMPLE_THREADED_LEVEL3
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > 210
#define USE_ALLOC_HEAP
#endif
static FLOAT dm1 = -1.;
#ifndef KERNEL_FUNC
@ -342,7 +349,12 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
blas_arg_t newarg;
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range[MAX_CPU_NUMBER + 100];
@ -387,6 +399,15 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
newarg.c = args -> c;
newarg.lda = args -> lda;
newarg.alpha = args -> alpha;
#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif
newarg.common = (void *)job;
n_from = 0;
@ -494,6 +515,10 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
exec_blas(num_cpu, queue);
}
#ifdef USE_ALLOC_HEAP
free(job);
#endif
return 0;
}