From dcc5d6291e7b02761acfb6161c04ba1f8f25b502 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 1 Nov 2018 01:42:09 +0000 Subject: [PATCH 1/2] skylakex: Make the sgemm/dgemm beta code robust for a N=0 or M=0 case in the threading code there are cases where N or M can become 0, and the optimized beta code did not handle this well, leading to a crash during the audit for the crash a few edge conditions on the if statements were found and fixed as well --- kernel/x86_64/dgemm_beta_skylakex.c | 6 ++++-- kernel/x86_64/sgemm_beta_skylakex.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 384e9f60b..6a824c9b5 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, return 0; } + if (m == 0 || n == 0) + return 0; c_offset = c; @@ -69,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_pd(c_offset1, z_zero); _mm512_storeu_pd(c_offset1 + 8, z_zero); _mm512_storeu_pd(c_offset1 + 16, z_zero); @@ -77,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) { _mm512_storeu_pd(c_offset1, z_zero); c_offset1 += 8; i -= 8; diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 54f9664e9..4e40acadf 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, return 0; } + if (n == 0 || m == 0) + return; c_offset = c; @@ -71,13 +73,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) { _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8; From 5b708e5eb1b17af9c45e0da2993da8a4756cb912 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 1 Nov 2018 01:43:20 +0000 Subject: [PATCH 2/2] sgemm/dgemm: add a way for an arch kernel to specify prefered sizes The current gemm threading code can make very unfortunate choices, for example on my 10 core system a 1024x1024x1024 matrix multiply ends up chunking into blocks of 102... which is not a vector friendly size and performance ends up horrible. this patch adds a helper define where an architecture can specify a preference for size multiples. This is different from existing defines that are minimum sizes and such. The performance increase with this patch for the 1024x1024x1024 sgemm is 2.3x (!!) --- driver/level3/level3_thread.c | 22 ++++++++++++++++++++++ param.h | 1 + 2 files changed, 23 insertions(+) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index aeb5e6ed4..de29247d4 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -48,6 +48,10 @@ #define SWITCH_RATIO 2 #endif +#ifndef GEMM_PREFERED_SIZE +#define GEMM_PREFERED_SIZE 1 +#endif + //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, return 0; } +static int round_up(int remainder, int width, int multiple) +{ + if (multiple > remainder || width <= multiple) + return width; + width = (width + multiple - 1) / multiple; + width = width * multiple; + return width; +} + + static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { @@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_parts = 0; while (m > 0){ width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); + + width = round_up(m, width, GEMM_PREFERED_SIZE); + m -= width; + if (m < 0) width = width + m; range_M[num_parts + 1] = range_M[num_parts] + width; + num_parts ++; } for (i = num_parts; i < MAX_CPU_NUMBER; i++) { @@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG if (width < SWITCH_RATIO) { width = SWITCH_RATIO; } + width = round_up(n, width, GEMM_PREFERED_SIZE); + n -= width; if (n < 0) width = width + n; range_N[num_parts + 1] = range_N[num_parts] + width; + num_parts ++; } for (j = num_parts; j < MAX_CPU_NUMBER; j++) { diff --git a/param.h b/param.h index e4ec1b2b5..d1b211584 100644 --- a/param.h +++ b/param.h @@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 #define SWITCH_RATIO 32 +#define GEMM_PREFERED_SIZE 32 #ifdef ARCH_X86