Allow gemv and ger to allocate their work buffer on the stack
ger and gemv call blas_memory_alloc/free, which in turn call blas_lock. blas_lock creates thread contention when the matrices are small and the number of threads is high enough. We avoid calling blas_memory_alloc by replacing it with a stack allocation. This can be enabled with: make MAX_STACK_ALLOC=2048 (the make variable is forwarded to the compiler as -DMAX_STACK_ALLOC=2048). The given size (in bytes) must be high enough to avoid thread contention and small enough to avoid stack overflow. Fix #478
This commit is contained in:
parent
cbb3ab80e7
commit
e9d9a8eae3
|
@ -305,6 +305,10 @@ ifdef SANITY_CHECK
|
||||||
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
|
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef MAX_STACK_ALLOC
|
||||||
|
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
||||||
|
endif
|
||||||
|
|
||||||
#
|
#
|
||||||
# Architecture dependent settings
|
# Architecture dependent settings
|
||||||
#
|
#
|
||||||
|
|
|
@ -208,7 +208,18 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (incx < 0) x -= (lenx - 1) * incx;
|
if (incx < 0) x -= (lenx - 1) * incx;
|
||||||
if (incy < 0) y -= (leny - 1) * incy;
|
if (incy < 0) y -= (leny - 1) * incy;
|
||||||
|
|
||||||
|
#ifdef MAX_STACK_ALLOC
|
||||||
|
int stack_alloc_size = m + n;
|
||||||
|
if(stack_alloc_size < 128)
|
||||||
|
//dgemv_n.S require a 128 bytes buffer
|
||||||
|
stack_alloc_size = 128;
|
||||||
|
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
|
||||||
|
stack_alloc_size = 0;
|
||||||
|
FLOAT stack_buffer[stack_alloc_size];
|
||||||
|
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
|
||||||
|
#else
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
|
||||||
|
@ -237,7 +248,10 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blas_memory_free(buffer);
|
#ifdef MAX_STACK_ALLOC
|
||||||
|
if(!stack_alloc_size)
|
||||||
|
#endif
|
||||||
|
blas_memory_free(buffer);
|
||||||
|
|
||||||
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
||||||
|
|
||||||
|
|
|
@ -171,7 +171,15 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (incy < 0) y -= (n - 1) * incy;
|
if (incy < 0) y -= (n - 1) * incy;
|
||||||
if (incx < 0) x -= (m - 1) * incx;
|
if (incx < 0) x -= (m - 1) * incx;
|
||||||
|
|
||||||
|
#ifdef MAX_STACK_ALLOC
|
||||||
|
int stack_alloc_size = m;
|
||||||
|
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
|
||||||
|
stack_alloc_size = 0;
|
||||||
|
FLOAT stack_buffer[stack_alloc_size];
|
||||||
|
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
|
||||||
|
#else
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef SMPTEST
|
#ifdef SMPTEST
|
||||||
nthreads = num_cpu_avail(2);
|
nthreads = num_cpu_avail(2);
|
||||||
|
@ -190,7 +198,10 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blas_memory_free(buffer);
|
#ifdef MAX_STACK_ALLOC
|
||||||
|
if(!stack_alloc_size)
|
||||||
|
#endif
|
||||||
|
blas_memory_free(buffer);
|
||||||
|
|
||||||
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
||||||
|
|
||||||
|
|
|
@ -302,7 +302,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
if ( n < 1 ) return(0);
|
if ( n < 1 ) return(0);
|
||||||
|
|
||||||
xbuffer = buffer;
|
xbuffer = buffer;
|
||||||
ytemp = buffer + NBMAX;
|
ytemp = buffer + (m < NBMAX ? m : NBMAX);
|
||||||
|
|
||||||
n0 = n / NBMAX;
|
n0 = n / NBMAX;
|
||||||
n1 = (n % NBMAX) >> 2 ;
|
n1 = (n % NBMAX) >> 2 ;
|
||||||
|
|
Loading…
Reference in New Issue