From c9d32674eaa2602184c2719dde15ac3fbebf41b7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 9 Aug 2020 19:17:04 +0200
Subject: [PATCH 1/7] Add memory barrier to the blas_lock implementation for
 Linux

as recommended by cparrott73 in #2760
---
 common_power.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/common_power.h b/common_power.h
index aa19794b5..e0685f760 100644
--- a/common_power.h
+++ b/common_power.h
@@ -105,6 +105,7 @@ static void INLINE blas_lock(volatile unsigned long *address){
 	   "	bne- 1f\n"
 	   "	stwcx. %2,0, %1\n"
 	   "	bne- 0b\n"
+	   "    isync\n"
 	   "1:    "
 	: "=&r"(ret)
 	: "r"(address), "r" (val)

From e2828e30aa5fc5670d0f4d4d42fc26649a4c3c64 Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand <mhillen@linux.ibm.com>
Date: Tue, 11 Aug 2020 12:55:42 +0200
Subject: [PATCH 2/7] s390x: Optimize SGEMM/DGEMM blocks for z14 with explicit
 loop unrolling/interleaving

Improve performance of SGEMM and DGEMM on z14 and z15 by unrolling and
interleaving the inner loop of the SGEMM 16x4 and DGEMM 8x4 blocks.
Specifically, we explicitly interleave vector register loads and
computation of two iterations.

Note that this change only adds one C function, since SGEMM 16x4 and
DGEMM 8x4 actually map to the same C code: they both hold intermediate
results in a 4x4 grid of vector registers, and the C implementation is
built around that.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
---
 kernel/zarch/gemm_vec.c | 213 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 212 insertions(+), 1 deletion(-)

diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c
index eb6d7700b..eae2e4d69 100644
--- a/kernel/zarch/gemm_vec.c
+++ b/kernel/zarch/gemm_vec.c
@@ -249,7 +249,6 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
 
 
 #if UNROLL_M == 16
-VECTOR_BLOCK(16, 4)
 VECTOR_BLOCK(16, 2)
 VECTOR_BLOCK(16, 1)
 #endif
@@ -257,7 +256,9 @@ VECTOR_BLOCK(16, 1)
 VECTOR_BLOCK(8, 8)
 VECTOR_BLOCK(4, 8)
 #endif
+#ifndef DOUBLE
 VECTOR_BLOCK(8, 4)
+#endif
 VECTOR_BLOCK(8, 2)
 VECTOR_BLOCK(8, 1)
 VECTOR_BLOCK(4, 4)
@@ -267,8 +268,218 @@ VECTOR_BLOCK(4, 1)
 #ifdef DOUBLE
 VECTOR_BLOCK(2, 4)
 VECTOR_BLOCK(2, 2)
+VECTOR_BLOCK(2, 1)
 #endif
 
+
+/**
+ * Calculate a row-block that fits 4x4 vector registers using a loop
+ * unrolled-by-2 with explicit interleaving to better overlap loads and
+ * computation.
+ * This function fits 16x4 blocks for SGEMM and 8x4 blocks for DGEMM.
+ */
+#ifdef DOUBLE
+static inline void GEBP_block_8_4(
+#else // float
+static inline void GEBP_block_16_4(
+#endif
+    FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B,
+    FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) {
+#define VEC_ROWS 4
+#define VEC_COLS 4
+#define ROWS VEC_ROWS * VLEN_FLOATS
+#define COLS (VEC_COLS)
+
+	/*
+	 * Hold intermediate results in vector registers.
+	 * Since we need to force the compiler's hand in places, we need to use
+	 * individual variables in contrast to the generic implementation's
+	 * arrays.
+	 */
+#define INIT_ROW_OF_C(ROW)                                        \
+    vector_float A##ROW = vec_load_hinted(A + ROW * VLEN_FLOATS); \
+    vector_float C_##ROW##_0 = A##ROW * B[0];                     \
+    vector_float C_##ROW##_1 = A##ROW * B[1];                     \
+    vector_float C_##ROW##_2 = A##ROW * B[2];                     \
+    vector_float C_##ROW##_3 = A##ROW * B[3];
+
+	INIT_ROW_OF_C(0)
+	INIT_ROW_OF_C(1)
+	INIT_ROW_OF_C(2)
+	INIT_ROW_OF_C(3)
+#undef INIT_ROW_OF_C
+
+	if (bk > 1) {
+		BLASLONG k = 1;
+		vector_float Ak[VEC_ROWS], Aknext[VEC_ROWS];
+		vector_float Bk[VEC_COLS], Bknext[VEC_COLS];
+
+		/*
+		 * Note that in several places, we use dummy asm statements to
+		 * enforce an instruction sequence that we identified
+		 * empirically.
+		 */
+
+		for (BLASLONG j = 0; j < VEC_COLS; j++)
+			Bk[j] = vec_splats(B[j + k * COLS]);
+		asm("");
+
+		for (BLASLONG i = 0; i < VEC_ROWS; i++)
+			Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + k * ROWS);
+
+		for (; k < (bk - 2); k += 2) {
+			/*
+			 * Load inputs for (k+1) into registers.
+			 * Loading from B first is advantageous.
+			 */
+			for (BLASLONG j = 0; j < VEC_COLS; j++)
+				Bknext[j] = vec_splats(B[j + (k + 1) * COLS]);
+			asm("");
+			for (BLASLONG i = 0; i < VEC_ROWS; i++)
+				Aknext[i] = vec_load_hinted(A + i * VLEN_FLOATS +
+						(k + 1) * ROWS);
+
+			/*
+			 * To achieve better instruction-level parallelism,
+			 * make sure to first load input data for (k+1) before
+			 * initiating compute for k. We enforce that ordering
+			 * with a pseudo asm statement.
+			 * Note that we need to massage this particular "barrier"
+			 * depending on the gcc version.
+			 */
+#if __GNUC__ > 7
+#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX)                                    \
+    do {                                                                       \
+	asm(""                                                                 \
+	    : "+v"(C_0_0), "+v"(C_0_1), "+v"(C_0_2), "+v"(C_0_3), "+v"(C_1_0), \
+	      "+v"(C_1_1), "+v"(C_1_2), "+v"(C_1_3)                            \
+	    : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]),         \
+	      "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]),         \
+	      "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3]));                           \
+	asm(""                                                                 \
+	    : "+v"(C_2_0), "+v"(C_2_1), "+v"(C_2_2), "+v"(C_2_3), "+v"(C_3_0), \
+	      "+v"(C_3_1), "+v"(C_3_2), "+v"(C_3_3)                            \
+	    : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]),         \
+	      "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]),         \
+	      "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3]));                           \
+    } while (0)
+#else // __GNUC__ <= 7
+#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \
+    do {                                    \
+	asm("");                            \
+    } while (0)
+#endif
+
+			BARRIER_READ_BEFORE_COMPUTE(knext);
+
+			/* Compute for (k) */
+			C_0_0 += Ak[0] * Bk[0];
+			C_1_0 += Ak[1] * Bk[0];
+			C_2_0 += Ak[2] * Bk[0];
+			C_3_0 += Ak[3] * Bk[0];
+
+			C_0_1 += Ak[0] * Bk[1];
+			C_1_1 += Ak[1] * Bk[1];
+			C_2_1 += Ak[2] * Bk[1];
+			C_3_1 += Ak[3] * Bk[1];
+
+			C_0_2 += Ak[0] * Bk[2];
+			C_1_2 += Ak[1] * Bk[2];
+			C_2_2 += Ak[2] * Bk[2];
+			C_3_2 += Ak[3] * Bk[2];
+
+			C_0_3 += Ak[0] * Bk[3];
+			C_1_3 += Ak[1] * Bk[3];
+			C_2_3 += Ak[2] * Bk[3];
+			C_3_3 += Ak[3] * Bk[3];
+
+			asm("");
+
+			/*
+			 * Load inputs for (k+2) into registers.
+			 * First load from B.
+			 */
+			for (BLASLONG j = 0; j < VEC_COLS; j++)
+				Bk[j] = vec_splats(B[j + (k + 2) * COLS]);
+			asm("");
+			for (BLASLONG i = 0; i < VEC_ROWS; i++)
+				Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + (k + 2) * ROWS);
+
+			/*
+			 * As above, make sure to first schedule the loads for (k+2)
+			 * before compute for (k+1).
+			 */
+			BARRIER_READ_BEFORE_COMPUTE(k);
+
+			/* Compute on (k+1) */
+			C_0_0 += Aknext[0] * Bknext[0];
+			C_1_0 += Aknext[1] * Bknext[0];
+			C_2_0 += Aknext[2] * Bknext[0];
+			C_3_0 += Aknext[3] * Bknext[0];
+
+			C_0_1 += Aknext[0] * Bknext[1];
+			C_1_1 += Aknext[1] * Bknext[1];
+			C_2_1 += Aknext[2] * Bknext[1];
+			C_3_1 += Aknext[3] * Bknext[1];
+
+			C_0_2 += Aknext[0] * Bknext[2];
+			C_1_2 += Aknext[1] * Bknext[2];
+			C_2_2 += Aknext[2] * Bknext[2];
+			C_3_2 += Aknext[3] * Bknext[2];
+
+			C_0_3 += Aknext[0] * Bknext[3];
+			C_1_3 += Aknext[1] * Bknext[3];
+			C_2_3 += Aknext[2] * Bknext[3];
+			C_3_3 += Aknext[3] * Bknext[3];
+		}
+
+		/* Wrapup remaining k's */
+		for (; k < bk; k++) {
+			vector_float Ak;
+
+#define COMPUTE_WRAPUP_ROW(ROW)                             \
+    Ak = vec_load_hinted(A + ROW * VLEN_FLOATS + k * ROWS); \
+    C_##ROW##_0 += Ak * B[0 + k * COLS];                    \
+    C_##ROW##_1 += Ak * B[1 + k * COLS];                    \
+    C_##ROW##_2 += Ak * B[2 + k * COLS];                    \
+    C_##ROW##_3 += Ak * B[3 + k * COLS];
+
+			COMPUTE_WRAPUP_ROW(0)
+			COMPUTE_WRAPUP_ROW(1)
+			COMPUTE_WRAPUP_ROW(2)
+			COMPUTE_WRAPUP_ROW(3)
+#undef COMPUTE_WRAPUP_ROW
+		}
+	}
+
+	/*
+	 * Unpack the accumulators C_<row>_<col> into outer C_i, multiply
+	 * by alpha and add up (or assign for TRMM).
+	 */
+#define WRITE_BACK_C(ROW, COL)                                   \
+    do {                                                         \
+	vector_float *Cij =                                      \
+	    (vector_float *)(C + ROW * VLEN_FLOATS + COL * ldc); \
+	if (trmm) {                                              \
+	    *Cij = alpha * C_##ROW##_##COL;                   \
+	} else {                                                 \
+	    *Cij += alpha * C_##ROW##_##COL;                  \
+	}                                                        \
+    } while (0)
+
+	WRITE_BACK_C(0, 0); WRITE_BACK_C(0, 1); WRITE_BACK_C(0, 2); WRITE_BACK_C(0, 3);
+	WRITE_BACK_C(1, 0); WRITE_BACK_C(1, 1); WRITE_BACK_C(1, 2); WRITE_BACK_C(1, 3);
+	WRITE_BACK_C(2, 0); WRITE_BACK_C(2, 1); WRITE_BACK_C(2, 2); WRITE_BACK_C(2, 3);
+	WRITE_BACK_C(3, 0); WRITE_BACK_C(3, 1); WRITE_BACK_C(3, 2); WRITE_BACK_C(3, 3);
+#undef WRITE_BACK_C
+
+#undef ROWS
+#undef VEC_ROWS
+#undef COLS
+#undef VEC_COLS
+#undef BARRIER_READ_BEFORE_COMPUTE
+}
+
 /**
  * Handle calculation for row blocks in C_i of any size by dispatching into
  * macro-defined (inline) functions or by deferring to a simple generic

From 07c334e7be2f30a07263f0f827cb92fd257704dc Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand <mhillen@linux.ibm.com>
Date: Tue, 11 Aug 2020 12:55:53 +0200
Subject: [PATCH 3/7] s390x: Factor out small block sizes for SGEMM/DGEMM on
 z14

For small register blockings that are too small to fill up vector
registers with column vectors, we currently use a generic code block.
Replace that with instantiations of the generic code as individual
functions, so that the compiler can optimize each one separately.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
---
 kernel/zarch/gemm_vec.c | 78 +++++++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 27 deletions(-)

diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c
index eae2e4d69..741c09431 100644
--- a/kernel/zarch/gemm_vec.c
+++ b/kernel/zarch/gemm_vec.c
@@ -265,12 +265,58 @@ VECTOR_BLOCK(4, 4)
 VECTOR_BLOCK(4, 2)
 VECTOR_BLOCK(4, 1)
 
+/**
+ * Calculate for a row-block in C_i of size ROWSxCOLS using scalar operations.
+ * Simple implementation for smaller block sizes
+ *
+ * @param[in] 	A	Pointer to current block of input matrix A.
+ * @param[in]	k	Number of columns in A.
+ * @param[in]	B	Pointer to current block of input matrix B.
+ * @param[inout] C	Pointer to current block of output matrix C.
+ * @param[in]	ldc	Offset between elements in adjacent columns in C.
+ * @param[in]	alpha	Scalar factor.
+ */
+#define SCALAR_BLOCK(ROWS, COLS)                                          \
+    static inline void GEBP_block_##ROWS##_##COLS(                        \
+	FLOAT const *restrict A, BLASLONG k, FLOAT const *restrict B,     \
+	FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) {                   \
+	FLOAT Caux[ROWS][COLS] __attribute__((aligned(16)));              \
+                                                                          \
+	/*                                                                \
+	 * Peel off first iteration (i.e., column of A) for               \
+	 * initializing Caux                                              \
+	 */                                                               \
+	for (BLASLONG i = 0; i < ROWS; i++)                               \
+	    for (BLASLONG j = 0; j < COLS; j++) Caux[i][j] = A[i] * B[j]; \
+                                                                          \
+	for (BLASLONG kk = 1; kk < k; kk++)                               \
+	    for (BLASLONG i = 0; i < ROWS; i++)                           \
+		for (BLASLONG j = 0; j < COLS; j++)                       \
+		    Caux[i][j] += A[i + kk * ROWS] * B[j + kk * COLS];    \
+                                                                          \
+	for (BLASLONG i = 0; i < ROWS; i++)                               \
+	    for (BLASLONG j = 0; j < COLS; j++)                           \
+		if (trmm) {                                               \
+		    C[i + j * ldc] = alpha * Caux[i][j];                  \
+		} else {                                                  \
+		    C[i + j * ldc] += alpha * Caux[i][j];                 \
+		}                                                         \
+    }
+
 #ifdef DOUBLE
 VECTOR_BLOCK(2, 4)
 VECTOR_BLOCK(2, 2)
 VECTOR_BLOCK(2, 1)
+#else
+SCALAR_BLOCK(2, 4)
+SCALAR_BLOCK(2, 2)
+SCALAR_BLOCK(2, 1)
 #endif
 
+SCALAR_BLOCK(1, 4)
+SCALAR_BLOCK(1, 2)
+SCALAR_BLOCK(1, 1)
+
 
 /**
  * Calculate a row-block that fits 4x4 vector registers using a loop
@@ -526,6 +572,8 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
 		}
 	}
 
+	/* Dispatch into the implementation for each block size: */
+
 #define BLOCK(bm, bn)                                           \
 	if (m == bm && n == bn) {                               \
 		GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
@@ -541,35 +589,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
 	BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
 	BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
 
-	#ifdef DOUBLE
-	BLOCK(2, 4);
-	BLOCK(2, 2);
-	#endif
+	BLOCK(2, 4); BLOCK(2, 2); BLOCK(2, 1);
+
+	BLOCK(1, 4); BLOCK(1, 2); BLOCK(1, 1);
 
 #undef BLOCK
-
-	/* simple implementation for smaller block sizes: */
-	FLOAT Caux[m][n] __attribute__ ((aligned (16)));
-
-	/*
-	 * Peel off first iteration (i.e., column of A) for initializing Caux
-	 */
-	for (BLASLONG i = 0; i < m; i++)
-		for (BLASLONG j = 0; j < n; j++)
-			Caux[i][j] = A[i] * B[j];
-
-	for (BLASLONG kk = 1; kk < k; kk++)
-		for (BLASLONG i = 0; i < m; i++)
-			for (BLASLONG j = 0; j < n; j++)
-				Caux[i][j] += A[i + kk * m] * B[j + kk * n];
-
-	for (BLASLONG i = 0; i < m; i++)
-		for (BLASLONG j = 0; j < n; j++)
-			if (trmm) {
-				C[i + j * ldc] = alpha * Caux[i][j];
-			} else {
-				C[i + j * ldc] += alpha * Caux[i][j];
-			}
 }
 
 /**

From e115c97e05889fc2e8edf041cdfd92d00d63a884 Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand <mhillen@linux.ibm.com>
Date: Tue, 11 Aug 2020 12:55:59 +0200
Subject: [PATCH 4/7] s390x/SGEMM: adjust default P and Q to multiples of M

We recently changed the register blocking for SGEMM on s390x to 16x4.
However, we did not adjust P and Q to multiples of 16 and thus fell
back to the 8x4 kernel at each block's margin, without need. Adjust P
and Q to multiples of 16 to employ the faster 16x4 kernel for complete
full-sized blocks.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
---
 param.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/param.h b/param.h
index 476f237a1..3e539a2b8 100644
--- a/param.h
+++ b/param.h
@@ -3092,12 +3092,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define ZGEMM_DEFAULT_UNROLL_M  4
 #define ZGEMM_DEFAULT_UNROLL_N  4
 
-#define SGEMM_DEFAULT_P	456
+#define SGEMM_DEFAULT_P	480
 #define DGEMM_DEFAULT_P	320
 #define CGEMM_DEFAULT_P 480
 #define ZGEMM_DEFAULT_P 224
 
-#define SGEMM_DEFAULT_Q 488
+#define SGEMM_DEFAULT_Q 512
 #define DGEMM_DEFAULT_Q 384
 #define CGEMM_DEFAULT_Q 128
 #define ZGEMM_DEFAULT_Q 352

From fee361ae64f2d02552713ade0ee972e6efdb1ed4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 11 Aug 2020 13:27:19 +0200
Subject: [PATCH 5/7] fix another source of NO_CBLAS=0 surprise

---
 interface/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/Makefile b/interface/Makefile
index 44a9fdcf0..2dbd60073 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -367,7 +367,7 @@ CZBLAS3OBJS   +=  cblas_zgemm3m.$(SUFFIX)
 endif
 
 
-ifndef NO_CBLAS
+ifneq ($(NO_CBLAS), 1)
 
 override CFLAGS += -I.
 

From 619343278d6d6e8ec3989fb883da333ee087d351 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 11 Aug 2020 13:40:40 +0200
Subject: [PATCH 6/7] Fix mishandling of NO_CBLAS=0 and NO_LAPACKE=0

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index c1d943fac..7a03b08f0 100644
--- a/Makefile
+++ b/Makefile
@@ -141,7 +141,7 @@ ifndef NO_FBLAS
 	$(MAKE) -C test all
 endif
 	$(MAKE) -C utest all
-ifndef NO_CBLAS
+ifneq ($(NO_CBLAS), 1)
 	$(MAKE) -C ctest all
 ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
 	$(MAKE) -C cpp_thread_test all
@@ -244,7 +244,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
 endif
-ifndef NO_LAPACKE
+ifneq ($(NO_LAPACKE), 1)
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
 endif
 endif

From efdd237a91646f0ce58815ef6507c04e393813a6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 12 Aug 2020 23:08:38 +0200
Subject: [PATCH 7/7] Add a dedicated POWER9 build to the Travis CI (#2774)

* Add dedicated POWER9 build (using new syntax to ensure it runs as a P9-only containerized job rather than a VM that
might end up on P8 hardware half of the time)
* Bump gcc version for POWER9 build
---
 .travis.yml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 101147353..307010e40 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -75,6 +75,23 @@ matrix:
         - TARGET_BOX=LINUX32
         - BTYPE="BINARY=32"
 
+    - os: linux
+      arch: ppc64le
+      dist: bionic
+      compiler: gcc
+      before_script:
+        - sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y
+        - sudo apt-get update
+        - sudo apt-get install gcc-9 gfortran-9 -y
+      script:
+        - make QUIET_MAKE=1  BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
+        - make -C test $COMMON_FLAGS $BTYPE
+        - make -C ctest $COMMON_FLAGS $BTYPE
+        - make -C utest $COMMON_FLAGS $BTYPE 
+      env:
+        # for matrix annotation only
+        - TARGET_BOX=PPC64LE_LINUX_P9
+
     - os: linux
       compiler: gcc
       addons: