When NUM_THREADS(MAX_CPU_NUNBERS) is very large ,e.g. 256. typedef struct { volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; job_t job[MAX_CPU_NUMBER]; The job array is equal 8MB. Thus, We use malloc instead of stack allocation.
This commit is contained in:
parent
886cbaf4e4
commit
5d3312142a
|
@ -48,6 +48,12 @@
|
|||
#define SWITCH_RATIO 2
|
||||
#endif
|
||||
|
||||
//The array of job_t may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > 210
|
||||
#define USE_ALLOC_HEAP
|
||||
#endif
|
||||
|
||||
#ifndef GEMM3M_LOCAL
|
||||
#if defined(NN)
|
||||
#define GEMM3M_LOCAL GEMM3M_NN
|
||||
|
@ -836,7 +842,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
BLASLONG range_M[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_N[MAX_CPU_NUMBER + 1];
|
||||
|
||||
job_t job[MAX_CPU_NUMBER];
|
||||
#ifndef USE_ALLOC_HEAP
|
||||
job_t job[MAX_CPU_NUMBER];
|
||||
#else
|
||||
job_t * job = NULL;
|
||||
#endif
|
||||
|
||||
BLASLONG num_cpu_m, num_cpu_n;
|
||||
|
||||
|
@ -866,6 +876,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
newarg.alpha = args -> alpha;
|
||||
newarg.beta = args -> beta;
|
||||
newarg.nthreads = args -> nthreads;
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||
if(job==NULL){
|
||||
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
newarg.common = (void *)job;
|
||||
|
||||
if (!range_m) {
|
||||
|
@ -945,6 +964,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
exec_blas(num_cpu_m, queue);
|
||||
}
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
free(job);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -48,6 +48,12 @@
|
|||
#define SWITCH_RATIO 2
|
||||
#endif
|
||||
|
||||
//The array of job_t may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > 210
|
||||
#define USE_ALLOC_HEAP
|
||||
#endif
|
||||
|
||||
#ifndef SYRK_LOCAL
|
||||
#if !defined(LOWER) && !defined(TRANS)
|
||||
#define SYRK_LOCAL SYRK_UN
|
||||
|
@ -502,7 +508,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
blas_arg_t newarg;
|
||||
|
||||
#ifndef USE_ALLOC_HEAP
|
||||
job_t job[MAX_CPU_NUMBER];
|
||||
#else
|
||||
job_t * job = NULL;
|
||||
#endif
|
||||
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
|
||||
BLASLONG range[MAX_CPU_NUMBER + 100];
|
||||
|
@ -556,6 +567,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
newarg.ldc = args -> ldc;
|
||||
newarg.alpha = args -> alpha;
|
||||
newarg.beta = args -> beta;
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||
if(job==NULL){
|
||||
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
newarg.common = (void *)job;
|
||||
|
||||
if (!range_n) {
|
||||
|
@ -668,6 +688,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
exec_blas(num_cpu, queue);
|
||||
}
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
free(job);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -48,6 +48,12 @@
|
|||
#define SWITCH_RATIO 2
|
||||
#endif
|
||||
|
||||
//The array of job_t may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > 210
|
||||
#define USE_ALLOC_HEAP
|
||||
#endif
|
||||
|
||||
#ifndef GEMM_LOCAL
|
||||
#if defined(NN)
|
||||
#define GEMM_LOCAL GEMM_NN
|
||||
|
@ -531,7 +537,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
|
||||
blas_arg_t newarg;
|
||||
|
||||
#ifndef USE_ALLOC_HEAP
|
||||
job_t job[MAX_CPU_NUMBER];
|
||||
#else
|
||||
job_t * job = NULL;
|
||||
#endif
|
||||
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
|
||||
BLASLONG range_M[MAX_CPU_NUMBER + 1];
|
||||
|
@ -575,6 +586,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
newarg.alpha = args -> alpha;
|
||||
newarg.beta = args -> beta;
|
||||
newarg.nthreads = args -> nthreads;
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||
if(job==NULL){
|
||||
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
newarg.common = (void *)job;
|
||||
|
||||
#ifdef PARAMTEST
|
||||
|
@ -660,6 +680,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
exec_blas(num_cpu_m, queue);
|
||||
}
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
free(job);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -217,7 +217,7 @@ int get_num_procs(void) {
|
|||
}
|
||||
return nums;
|
||||
}
|
||||
|
||||
/*
|
||||
void set_stack_limit(int limitMB){
|
||||
int result=0;
|
||||
struct rlimit rl;
|
||||
|
@ -235,6 +235,7 @@ void set_stack_limit(int limitMB){
|
|||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
@ -1273,12 +1274,6 @@ void CONSTRUCTOR gotoblas_init(void) {
|
|||
#endif
|
||||
|
||||
#ifdef DYNAMIC_ARCH
|
||||
#if defined(SMP) && defined(OS_DARWIN) && MAX_CPU_NUMBER > 128
|
||||
//Set stack limit to 16MB on Mac OS X
|
||||
//when NUM_THREADS>128 and DYNAMIC_ARCH=1.
|
||||
//Prevent the SEGFAULT bug.
|
||||
set_stack_limit(16);
|
||||
#endif
|
||||
gotoblas_dynamic_init();
|
||||
#endif
|
||||
|
||||
|
|
|
@ -43,6 +43,12 @@ static FLOAT dm1 = -1.;
|
|||
|
||||
double sqrt(double);
|
||||
|
||||
//In this case, the recursive getrf_parallel may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > 90
|
||||
#define USE_ALLOC_HEAP
|
||||
#endif
|
||||
|
||||
#ifndef CACHE_LINE_SIZE
|
||||
#define CACHE_LINE_SIZE 8
|
||||
#endif
|
||||
|
@ -356,7 +362,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
BLASLONG range_M[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_N[MAX_CPU_NUMBER + 1];
|
||||
|
||||
#ifndef USE_ALLOC_HEAP
|
||||
job_t job[MAX_CPU_NUMBER];
|
||||
#else
|
||||
job_t * job=NULL;
|
||||
#endif
|
||||
|
||||
BLASLONG width, nn, mm;
|
||||
BLASLONG i, j, k, is, bk;
|
||||
|
@ -401,7 +411,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
|
||||
newarg.c = ipiv;
|
||||
newarg.lda = lda;
|
||||
newarg.common = (void *)job;
|
||||
|
||||
info = 0;
|
||||
|
||||
|
@ -427,6 +436,16 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
|
||||
if (iinfo && !info) info = iinfo;
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||
if(job==NULL){
|
||||
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
newarg.common = (void *)job;
|
||||
|
||||
TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);
|
||||
|
||||
sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||
|
@ -586,6 +605,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
is += bk;
|
||||
}
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
free(job);
|
||||
#endif
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
|
|
|
@ -41,6 +41,13 @@
|
|||
|
||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||
|
||||
//The array of job_t may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > 210
|
||||
#define USE_ALLOC_HEAP
|
||||
#endif
|
||||
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifndef KERNEL_FUNC
|
||||
|
@ -342,7 +349,12 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
|
|||
|
||||
blas_arg_t newarg;
|
||||
|
||||
#ifndef USE_ALLOC_HEAP
|
||||
job_t job[MAX_CPU_NUMBER];
|
||||
#else
|
||||
job_t * job = NULL;
|
||||
#endif
|
||||
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
|
||||
BLASLONG range[MAX_CPU_NUMBER + 100];
|
||||
|
@ -387,6 +399,15 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
|
|||
newarg.c = args -> c;
|
||||
newarg.lda = args -> lda;
|
||||
newarg.alpha = args -> alpha;
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||
if(job==NULL){
|
||||
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
newarg.common = (void *)job;
|
||||
|
||||
n_from = 0;
|
||||
|
@ -494,6 +515,10 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
|
|||
exec_blas(num_cpu, queue);
|
||||
}
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
free(job);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue