sgemm/dgemm: add a way for an arch kernel to specify preferred sizes
The current gemm threading code can make very unfortunate choices. For example, on my 10-core system a 1024x1024x1024 matrix multiply ends up chunked into blocks of 102, which is not a vector-friendly size, and performance ends up horrible. This patch adds a helper define with which an architecture can specify a preference for size multiples. This is different from the existing defines, which specify minimum sizes and the like. The performance increase with this patch for the 1024x1024x1024 sgemm is 2.3x (!!)
This commit is contained in:
parent
dcc5d6291e
commit
5b708e5eb1
|
@ -48,6 +48,10 @@
|
||||||
#define SWITCH_RATIO 2
|
#define SWITCH_RATIO 2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Preferred multiple for the chunk widths handed out when the m and n
// ranges are split; it is passed as the `multiple` argument to round_up().
// Only defined here when nothing earlier has defined it. The default of 1
// makes round_up() return the width unchanged, i.e. no preference.
#ifndef GEMM_PREFERED_SIZE
|
||||||
|
#define GEMM_PREFERED_SIZE 1
|
||||||
|
#endif
|
||||||
|
|
||||||
//The array of job_t may overflow the stack.
|
//The array of job_t may overflow the stack.
|
||||||
//Instead, use malloc to alloc job_t.
|
//Instead, use malloc to alloc job_t.
|
||||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||||
|
@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Round a proposed chunk width up to the next multiple of `multiple`.
 *
 *   remainder  amount of work still left to hand out
 *   width      proposed width of the next chunk
 *   multiple   preferred size multiple for a chunk
 *
 * Returns `width` unchanged when no rounding is wanted or possible:
 *   - `multiple` is <= 1 (no preference; also guards against a zero or
 *     negative value, which previously could divide by zero),
 *   - `multiple` is larger than the remaining work, or
 *   - `width` does not exceed `multiple`.
 * Otherwise returns the smallest multiple of `multiple` that is >= `width`.
 *
 * The result may be larger than `remainder`; the caller is expected to
 * trim the overshoot.
 */
static int round_up(int remainder, int width, int multiple)
{
	int excess;

	if (multiple <= 1)
		return width;

	if (multiple > remainder || width <= multiple)
		return width;

	/* Here width > multiple > 1, so the remainder is in [0, multiple). */
	excess = width % multiple;
	if (excess != 0)
		width += multiple - excess;

	return width;
}
|
||||||
|
|
||||||
|
|
||||||
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
*range_n, FLOAT *sa, FLOAT *sb,
|
*range_n, FLOAT *sa, FLOAT *sb,
|
||||||
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
||||||
|
@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
num_parts = 0;
|
num_parts = 0;
|
||||||
while (m > 0){
|
while (m > 0){
|
||||||
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
|
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
|
||||||
|
|
||||||
|
width = round_up(m, width, GEMM_PREFERED_SIZE);
|
||||||
|
|
||||||
m -= width;
|
m -= width;
|
||||||
|
|
||||||
if (m < 0) width = width + m;
|
if (m < 0) width = width + m;
|
||||||
range_M[num_parts + 1] = range_M[num_parts] + width;
|
range_M[num_parts + 1] = range_M[num_parts] + width;
|
||||||
|
|
||||||
num_parts ++;
|
num_parts ++;
|
||||||
}
|
}
|
||||||
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
|
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
|
||||||
|
@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
if (width < SWITCH_RATIO) {
|
if (width < SWITCH_RATIO) {
|
||||||
width = SWITCH_RATIO;
|
width = SWITCH_RATIO;
|
||||||
}
|
}
|
||||||
|
width = round_up(n, width, GEMM_PREFERED_SIZE);
|
||||||
|
|
||||||
n -= width;
|
n -= width;
|
||||||
if (n < 0) width = width + n;
|
if (n < 0) width = width + n;
|
||||||
range_N[num_parts + 1] = range_N[num_parts] + width;
|
range_N[num_parts + 1] = range_N[num_parts] + width;
|
||||||
|
|
||||||
num_parts ++;
|
num_parts ++;
|
||||||
}
|
}
|
||||||
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
|
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
|
||||||
|
|
Loading…
Reference in New Issue