From dcc5d6291e7b02761acfb6161c04ba1f8f25b502 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 1 Nov 2018 01:42:09 +0000
Subject: [PATCH 1/2] skylakex: Make the sgemm/dgemm beta code robust for a N=0
 or M=0 case

in the threading code there are cases where N or M can become 0,
and the optimized beta code did not handle this well, leading
to a crash

during the audit for the crash a few edge conditions on the if statements
were found and fixed as well
---
 kernel/x86_64/dgemm_beta_skylakex.c | 6 ++++--
 kernel/x86_64/sgemm_beta_skylakex.c | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c
index 384e9f60b..6a824c9b5 100644
--- a/kernel/x86_64/dgemm_beta_skylakex.c
+++ b/kernel/x86_64/dgemm_beta_skylakex.c
@@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
 	return 0;
   }
 
+  if (m == 0 || n == 0)
+	return 0;
 
   c_offset = c;
 
@@ -69,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
 
       i = m;
 
-      while (i > 32) {
+      while (i >= 32) {
 	  _mm512_storeu_pd(c_offset1, z_zero);
 	  _mm512_storeu_pd(c_offset1 + 8, z_zero);
 	  _mm512_storeu_pd(c_offset1 + 16, z_zero);
@@ -77,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
 	  c_offset1 += 32;
 	  i -= 32;
       }
-      while (i > 8) {
+      while (i >= 8) {
 	  _mm512_storeu_pd(c_offset1, z_zero);
 	  c_offset1 += 8;
 	  i -= 8;
diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c
index 54f9664e9..4e40acadf 100644
--- a/kernel/x86_64/sgemm_beta_skylakex.c
+++ b/kernel/x86_64/sgemm_beta_skylakex.c
@@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
 	return 0;
   }
 
+  if (n == 0 || m == 0)
+	return;
 
   c_offset = c;
 
@@ -71,13 +73,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
 
       i = m;
 
-      while (i > 32) {
+      while (i >= 32) {
 	  _mm512_storeu_ps(c_offset1, z_zero);
 	  _mm512_storeu_ps(c_offset1 + 16, z_zero);
 	  c_offset1 += 32;
 	  i -= 32;
       }
-      while (i > 8) {
+      while (i >= 8) {
 	  _mm256_storeu_ps(c_offset1, y_zero);
 	  c_offset1 += 8;
 	  i -= 8;

From 5b708e5eb1b17af9c45e0da2993da8a4756cb912 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 1 Nov 2018 01:43:20 +0000
Subject: [PATCH 2/2] sgemm/dgemm: add a way for an arch kernel to specify
 prefered sizes

The current gemm threading code can make very unfortunate choices, for
example on my 10 core system a 1024x1024x1024 matrix multiply ends up
chunking into blocks of 102... which is not a vector friendly size
and performance ends up horrible.

this patch adds a helper define where an architecture can specify
a preference for size multiples.
This is different from existing defines that are minimum sizes and such.

The performance increase with this patch for the 1024x1024x1024 sgemm
is 2.3x (!!)
---
 driver/level3/level3_thread.c | 22 ++++++++++++++++++++++
 param.h                       |  1 +
 2 files changed, 23 insertions(+)

diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index aeb5e6ed4..de29247d4 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -48,6 +48,10 @@
 #define SWITCH_RATIO 2
 #endif
 
+#ifndef GEMM_PREFERED_SIZE
+#define GEMM_PREFERED_SIZE 1
+#endif
+
 //The array of job_t may overflow the stack.
 //Instead, use malloc to alloc job_t.
 #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
   return 0;
 }
 
+static int round_up(int remainder, int width, int multiple)
+{
+	if (multiple > remainder || width <= multiple)
+		return width;
+	width = (width + multiple - 1) / multiple;
+	width = width * multiple;
+	return width;
+}
+
+
 static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
 		       *range_n, FLOAT *sa, FLOAT *sb,
                        BLASLONG nthreads_m, BLASLONG nthreads_n) {
@@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
   num_parts = 0;
   while (m > 0){
     width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
+
+    width = round_up(m, width, GEMM_PREFERED_SIZE);
+
     m -= width;
+
     if (m < 0) width = width + m;
     range_M[num_parts + 1] = range_M[num_parts] + width;
+
     num_parts ++;
   }
   for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
@@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
       if (width < SWITCH_RATIO) {
         width = SWITCH_RATIO;
       }
+      width = round_up(n, width, GEMM_PREFERED_SIZE);
+
       n -= width;
       if (n < 0) width = width + n;
       range_N[num_parts + 1] = range_N[num_parts] + width;
+
       num_parts ++;
     }
     for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
diff --git a/param.h b/param.h
index e4ec1b2b5..d1b211584 100644
--- a/param.h
+++ b/param.h
@@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P  8
 
 #define SWITCH_RATIO	32
+#define GEMM_PREFERED_SIZE	32
 
 #ifdef ARCH_X86