sgemm/dgemm: add a way for an arch kernel to specify preferred sizes

The current gemm threading code can make very unfortunate choices: for
example, on my 10-core system a 1024x1024x1024 matrix multiply ends up
chunked into blocks of 102, which is not a vector-friendly size, and
performance ends up horrible.
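
(The arithmetic: 1024 split over 10 threads is 1024 / 10 ≈ 102.4, so the
splitter hands out blocks 103 or 102 wide; neither divides evenly by the
typical 8- or 16-element SIMD widths the kernels are tiled for, so every
block ends in a remainder tail.)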

This patch adds a helper define, GEMM_PREFERED_SIZE, with which an
architecture can specify a preferred multiple for the block sizes.
This is different from the existing defines, which specify minimum sizes
and the like.

The performance increase with this patch for the 1024x1024x1024 sgemm
is 2.3x (!!)
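
For illustration only, here is a small standalone sketch (not part of the
patch) of the splitting loop with the new rounding applied; it uses plain
integer division where the driver uses blas_quickdivide, and hard-codes the
10-thread, 1024-element example from above:

#include <stdio.h>

/* same rounding rule as the round_up() helper added below: round a chunk
   width up to a multiple of 'multiple', unless too little work remains or
   the chunk is already smaller than one multiple */
static int round_up(int remainder, int width, int multiple)
{
    if (multiple > remainder || width <= multiple)
        return width;
    return ((width + multiple - 1) / multiple) * multiple;
}

int main(void)
{
    int m = 1024, nthreads = 10, prefered_size = 32;

    for (int part = 0; m > 0; part++) {
        /* divide the remaining work evenly over the remaining threads... */
        int width = (m + nthreads - part - 1) / (nthreads - part);
        /* ...then round the chunk up to the preferred multiple */
        width = round_up(m, width, prefered_size);
        m -= width;
        if (m < 0) width += m;
        printf("part %d: width %d\n", part, width);
    }
    /* prints widths 128 128 96 96 96 96 96 96 96 96 (all multiples of 32)
       instead of the 103/102-wide blocks plain division would produce */
    return 0;
}
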
Author: Arjan van de Ven
Date:   2018-11-01 01:43:20 +00:00
Parent: dcc5d6291e
Commit: 5b708e5eb1
2 changed files with 23 additions and 0 deletions

@@ -48,6 +48,10 @@
 #define SWITCH_RATIO 2
 #endif
+#ifndef GEMM_PREFERED_SIZE
+#define GEMM_PREFERED_SIZE 1
+#endif
 //The array of job_t may overflow the stack.
 //Instead, use malloc to alloc job_t.
 #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
   return 0;
 }
+static int round_up(int remainder, int width, int multiple)
+{
+  if (multiple > remainder || width <= multiple)
+    return width;
+  width = (width + multiple - 1) / multiple;
+  width = width * multiple;
+  return width;
+}
 static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
                        *range_n, FLOAT *sa, FLOAT *sb,
                        BLASLONG nthreads_m, BLASLONG nthreads_n) {
@@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
   num_parts = 0;
   while (m > 0){
     width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
+    width = round_up(m, width, GEMM_PREFERED_SIZE);
     m -= width;
     if (m < 0) width = width + m;
     range_M[num_parts + 1] = range_M[num_parts] + width;
     num_parts ++;
   }
   for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
@@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
     if (width < SWITCH_RATIO) {
       width = SWITCH_RATIO;
     }
+    width = round_up(n, width, GEMM_PREFERED_SIZE);
     n -= width;
     if (n < 0) width = width + n;
     range_N[num_parts + 1] = range_N[num_parts] + width;
     num_parts ++;
   }
   for (j = num_parts; j < MAX_CPU_NUMBER; j++) {

@@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 8
 #define SWITCH_RATIO 32
+#define GEMM_PREFERED_SIZE 32
 #ifdef ARCH_X86