From cd10b35fe9133e44c3aa3a2c6d5712b10bf046bf Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 9 May 2020 13:42:33 +0200
Subject: [PATCH 01/14] Handle trailing spaces and empty condition variables

---
 cmake/utils.cmake | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 7a125ec55..1c21e776e 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -43,7 +43,8 @@ macro(ParseMakefileVars MAKEFILE_IN)
     if (NOT "${line_match}" STREQUAL "")
       #message(STATUS "match on ${line_match}")
       set(var_name ${CMAKE_MATCH_1})
-      set(var_value ${CMAKE_MATCH_2})
+#     set(var_value ${CMAKE_MATCH_2})
+      string(STRIP ${CMAKE_MATCH_2} var_value)
       # check for Makefile variables in the string, e.g. $(TSUFFIX)
       string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
       foreach (make_var ${make_var_matches})
@@ -63,7 +64,7 @@ macro(ParseMakefileVars MAKEFILE_IN)
       string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
       if (NOT "${line_match}" STREQUAL "")
 #       message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
-        if (${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})
+        if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})
 #         message (STATUS "condition is true")
           set (IfElse 1)
         else ()

From 58d26b4448a22cd1447d11c6fb746e2a28f8b573 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 9 May 2020 17:15:36 +0200
Subject: [PATCH 02/14] Correct ifort options to the same as suggested by reference-lapack

---
 Makefile.system | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.system b/Makefile.system
index 023546009..1f1ae8353 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -855,7 +855,7 @@ ifneq ($(INTERFACE64), 0)
 FCOMMON_OPT += -i8
 endif
 endif
-FCOMMON_OPT += -recursive
+FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens
 ifeq ($(USE_OPENMP), 1)
 FCOMMON_OPT += -fopenmp
 endif

From 2271c3506b32f866eeffc3d46008fba68844fc72 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 9 May 2020 23:49:18 +0200
Subject: [PATCH 03/14] Work around excessive LAPACK test failures on Skylake-X

Something in the plain C parts of x86_64 cscal.c and zscal.c appears to
be miscompiled by both gfortran9 and ifort when compiling for
skylakex-avx512, even when the optimized Haswell microkernel is not in
use.
---
 kernel/x86_64/KERNEL.SKYLAKEX | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 65f031d03..448aee074 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -24,3 +24,6 @@ DGEMM_BETA = dgemm_beta_skylakex.c
 
 CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c
 ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c
+
+CSCALKERNEL = ../arm/zscal.c
+ZSCALKERNEL = ../arm/zscal.c

From ce90e2bd3f6e6e0bb338472d69fad47633639505 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan
Date: Mon, 11 May 2020 09:57:46 -0500
Subject: [PATCH 04/14] Include shgemm in benchtest

This patch enables benchtest for half-precision gemm when BUILD_HALF is
set during make.
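As a usage sketch (the target only exists with BUILD_HALF=1; the
from/to/step argument convention is assumed from the existing GEMM
benchmarks, and the sizes here are illustrative):

    make BUILD_HALF=1
    make -C benchmark BUILD_HALF=1 shgemm.goto
    ./benchmark/shgemm.goto 128 2048 128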
--- benchmark/Makefile | 20 ++++++++++++++++++-- benchmark/gemm.c | 13 ++++++++----- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 90d903ad7..53f422be4 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -49,6 +49,12 @@ else GOTO_LAPACK_TARGETS= endif +ifeq ($(BUILD_HALF),1) +GOTO_HALF_TARGETS=shgemm.goto +else +GOTO_HALF_TARGETS= +endif + ifeq ($(OSNAME), WINNT) goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ @@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemm #################################################### +ifeq ($(BUILD_HALF),1) +shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +endif + sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c zcholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +ifeq ($(BUILD_HALF),1) +shgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ +endif + sgemm.$(SUFFIX) : gemm.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/gemm.c b/benchmark/gemm.c index dd016a7c3..d2235330b 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef DOUBLE #define GEMM BLASFUNC(dgemm) +#elif defined(HALF) +#define GEMM BLASFUNC(shgemm) #else #define GEMM BLASFUNC(sgemm) #endif @@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){ int main(int argc, char *argv[]){ - FLOAT *a, *b, *c; + IFLOAT *a, *b; + FLOAT *c; FLOAT alpha[] = {1.0, 0.0}; FLOAT beta [] = {0.0, 0.0}; char transa = 'N'; @@ -184,10 +187,10 @@ int main(int argc, char *argv[]){ k = to; } - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) { + if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) { + if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) { @@ -199,10 +202,10 @@ int main(int argc, char *argv[]){ #endif for (i = 0; i < m * k * COMPSIZE; i++) { - a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; } for (i = 0; i < k * n * COMPSIZE; i++) { - b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; } for (i = 0; i < m * n * COMPSIZE; i++) { c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; From 8efba9b7c036783e0c2449ab58c50739381746d5 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Mon, 11 May 2020 17:15:10 -0500 Subject: [PATCH 05/14] Improve shgemm test This patch adds another check to test shgemm results. --- test/compare_sgemm_shgemm.c | 58 +++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c index d5bd84b91..7e254f844 100644 --- a/test/compare_sgemm_shgemm.c +++ b/test/compare_sgemm_shgemm.c @@ -46,6 +46,27 @@ typedef union } bits; } bfloat16_bits; +typedef union +{ + float v; + struct + { + uint32_t m:23; + uint32_t e:8; + uint32_t s:1; + } bits; +} float32_bits; + +float +float16to32 (bfloat16_bits f16) +{ + float32_bits f32; + f32.bits.s = f16.bits.s; + f32.bits.e = f16.bits.e; + f32.bits.m = (uint32_t) f16.bits.m << 16; + return f32.v; +} + int main (int argc, char *argv[]) { @@ -55,8 +76,6 @@ main (int argc, char *argv[]) int loop = 100; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; - char transa = 'N'; - char transb = 'N'; for (int x = 0; x <= loop; x++) { @@ -65,30 +84,45 @@ main (int argc, char *argv[]) float B[k * n]; float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; - float CC[m * n]; + float DD[m * n], CC[m * n]; for (int j = 0; j < m; j++) { for (int i = 0; i < m; i++) { - A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; - B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; CC[j * k + i] = 0; + DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, - &m, B, &k, &beta, C, &m); + &m, B, &k, &beta, C, &m); SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); - + &m, BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 0; l < k; l++) - if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0) - ret++; + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (fabs (CC[i * m + j] - C[i * m 
+ j]) > 1.0)
+	      ret++;
+      if (transA == 'N' && transB == 'N')
+	{
+	  for (i = 0; i < n; i++)
+	    for (j = 0; j < m; j++)
+	      for (l = 0; l < k; l++)
+		{
+		  DD[i * m + j] +=
+		    float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]);
+		}
+	  for (i = 0; i < n; i++)
+	    for (j = 0; j < m; j++)
+	      for (l = 0; l < k; l++)
+		if (CC[i * m + j] != DD[i * m + j])
+		  ret++;
+	}
     }
   if (ret != 0)
     fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret);

From 8c338616f907b0592f0f59f1e4a365c7b000bc9d Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand
Date: Mon, 11 May 2020 12:37:21 +0200
Subject: [PATCH 06/14] s390x: gate dynamic arch detection on gcc version and
 add generic

When building OpenBLAS with DYNAMIC_ARCH=1 on s390x (aka zarch), make
sure to include support for systems without the facilities introduced
with z13 (i.e., zarch_generic). Adjust runtime detection to fall back to
that generic code when running on an unknown platform other than Z13
through Z15. When detecting a Z13 or newer system, add a check for gcc
support for the architecture-specific features before selecting the
respective kernel. Fall back to Z13 or generic code in that case.

Signed-off-by: Marius Hillenbrand
---
 Makefile.system               |  3 +-
 driver/others/dynamic_zarch.c | 70 +++++++++++++++++++++++------------
 2 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 1f1ae8353..111fc717b 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -563,7 +563,8 @@ DYNAMIC_CORE += EMAG8180
 endif
 
 ifeq ($(ARCH), zarch)
-DYNAMIC_CORE = Z13
+DYNAMIC_CORE = ZARCH_GENERIC
+DYNAMIC_CORE += Z13
 DYNAMIC_CORE += Z14
 endif
 
diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c
index 90d3051b1..8bcfcd004 100644
--- a/driver/others/dynamic_zarch.c
+++ b/driver/others/dynamic_zarch.c
@@ -1,12 +1,25 @@
-
 #include "common.h"
+#include
+
+// Gate kernels for z13 and z14 on gcc version
+#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \
+    /* RHEL 7 since 7.3: */ \
+    (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \
+     __GNUC_RH_RELEASE__ >= 11)
+#define HAVE_Z13_SUPPORT
+#endif
+
+#if __GNUC__ >= 7
+#define HAVE_Z14_SUPPORT
+#endif
+
+extern gotoblas_t gotoblas_ZARCH_GENERIC;
+#ifdef HAVE_Z13_SUPPORT
 extern gotoblas_t gotoblas_Z13;
+#endif
+#ifdef HAVE_Z14_SUPPORT
 extern gotoblas_t gotoblas_Z14;
-//extern gotoblas_t gotoblas_Z15;
-//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
-//extern gotoblas_t gotoblas_Z14;
-//#endif
+#endif
 
 #define NUM_CORETYPES 4
 
@@ -16,18 +29,19 @@ static char* corename[] = {
 	"unknown",
 	"Z13",
 	"Z14",
-//	"Z15",
 	"ZARCH_GENERIC",
 };
 
 char* gotoblas_corename(void) {
+#ifdef HAVE_Z13_SUPPORT
 	if (gotoblas == &gotoblas_Z13) return corename[1];
+#endif
+#ifdef HAVE_Z14_SUPPORT
 	if (gotoblas == &gotoblas_Z14) return corename[2];
-//	if (gotoblas == &gotoblas_Z15) return corename[3];
-//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
-//	if (gotoblas == &gotoblas_POWER9) return corename[3];
-//#endif
-	return corename[0]; // try generic?
+#endif + if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; + + return corename[0]; } // __builtin_cpu_is is not supported by zarch @@ -49,14 +63,21 @@ static gotoblas_t* get_coretype(void) { fclose(infile); - if (strstr(p, "2964")) return &gotoblas_Z13; - if (strstr(p, "2965")) return &gotoblas_Z13; - if (strstr(p, "3906")) return &gotoblas_Z14; - if (strstr(p, "3907")) return &gotoblas_Z14; - if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14 - if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14 +#ifdef HAVE_Z13_SUPPORT + if (strstr(p, "2964") || strstr(p, "2965")) return &gotoblas_Z13; +#endif - return NULL; // should be ZARCH_GENERIC + // Z14 and Z15 systems + if (strstr(p, "3906") || strstr(p, "3907") || strstr(p, "8561") || + strstr(p, "8562")) +#ifdef HAVE_Z14_SUPPORT + return &gotoblas_Z14; +#else + return &gotoblas_Z13; +#endif + + // unknown system or compiler too old? use generic code for z architecture + return &gotoblas_ZARCH_GENERIC; } static gotoblas_t* force_coretype(char* coretype) { @@ -76,12 +97,13 @@ static gotoblas_t* force_coretype(char* coretype) { switch (found) { +#ifdef HAVE_Z13_SUPPORT case 1: return (&gotoblas_Z13); +#endif +#ifdef HAVE_Z14_SUPPORT case 2: return (&gotoblas_Z14); -// case 3: return (&gotoblas_Z15); -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -// case 3: return (&gotoblas_POWER9); -//#endif +#endif + case 3: return (&gotoblas_ZARCH_GENERIC); default: return NULL; } snprintf(message, 128, "Core not found: %s\n", coretype); @@ -109,9 +131,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas == NULL) { - snprintf(coremsg, 128, "Falling back to Z14 core\n"); + snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n"); openblas_warning(1, coremsg); - gotoblas = &gotoblas_Z14; + gotoblas = &gotoblas_ZARCH_GENERIC; } if (gotoblas && gotoblas->init) { From 62cf391cbbf5ebdec5dc44e814797c6298e626bc Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Mon, 11 May 2020 18:37:04 +0200 Subject: [PATCH 07/14] s390x: only build kernels supported by gcc with dynamic arch support When building with dynamic arch support, only build kernels for architectures that are supported by the gcc we are building with. 
Signed-off-by: Marius Hillenbrand
---
 Makefile.system | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index 111fc717b..98d9ae313 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -564,8 +564,26 @@ endif
 
 ifeq ($(ARCH), zarch)
 DYNAMIC_CORE = ZARCH_GENERIC
+
+# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer
+GCC_GE_52 := $(subst 0,,$(shell expr `$(CC) -dumpversion` \>= "5.2"))
+
+ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release)
+RHEL_WITH_Z13 := $(subst 0,,$(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"))
+endif
+
+ifeq ($(or $(GCC_GE_52),$(RHEL_WITH_Z13)), 1)
 DYNAMIC_CORE += Z13
+else
+$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x)
+endif
+
+GCC_MAJOR_GE_7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
+ifeq ($(GCC_MAJOR_GE_7), 1)
 DYNAMIC_CORE += Z14
+else
+$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x)
+endif
 endif
 
 ifeq ($(ARCH), power)

From 0dbe61a612708c1a689835dcf5fdb76b166e7729 Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand
Date: Mon, 11 May 2020 13:00:10 +0200
Subject: [PATCH 08/14] s390x: choose SIMD kernels at run-time based on OS and
 compiler support

Extend and simplify the run-time detection for dynamic architecture
support for z to check HW_CAP and only use SIMD features if advertised
by the OS. While at it, also honor the env variable LD_HWCAP_MASK and do
not use the CPU features masked there.

Note that we can only use the SIMD features on z13 or newer (i.e.,
Vector Facility or Vector-Enhancements Facilities) when the operating
system properly supports context-switching the vector registers. The OS
advertises that support as a bit in the HW_CAP value in the auxiliary
vector. While all recent Linux kernels have that support, we should
maintain compatibility with older versions that may still be in use.

Signed-off-by: Marius Hillenbrand
---
 driver/others/dynamic_zarch.c | 78 ++++++++++++++++++++++++-----------
 1 file changed, 53 insertions(+), 25 deletions(-)

diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c
index 8bcfcd004..403b34111 100644
--- a/driver/others/dynamic_zarch.c
+++ b/driver/others/dynamic_zarch.c
@@ -13,6 +13,39 @@
 #define HAVE_Z14_SUPPORT
 #endif
 
+// Guard the use of getauxval() on glibc version >= 2.16
+#ifdef __GLIBC__
+#include <features.h>
+#if __GLIBC_PREREQ(2, 16)
+#include <sys/auxv.h>
+#define HAVE_GETAUXVAL 1
+
+static unsigned long get_hwcap(void)
+{
+	unsigned long hwcap = getauxval(AT_HWCAP);
+	char *maskenv;
+
+	// honor requests for not using specific CPU features in LD_HWCAP_MASK
+	maskenv = getenv("LD_HWCAP_MASK");
+	if (maskenv)
+		hwcap &= strtoul(maskenv, NULL, 0);
+
+	return hwcap;
+	// note that a missing auxval is interpreted as no capabilities
+	// available, which is safe.
+}
+
+#else // __GLIBC_PREREQ(2, 16)
+#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
+
+static unsigned long get_hwcap(void) {
+	// treat missing support for getauxval() as no capabilities available,
+	// which is safe.
+	return 0;
+}
+#endif // __GLIBC_PREREQ(2, 16)
+#endif // __GLIBC__
+
 extern gotoblas_t gotoblas_ZARCH_GENERIC;
 #ifdef HAVE_Z13_SUPPORT
 extern gotoblas_t gotoblas_Z13;
@@ -44,39 +77,34 @@ char* gotoblas_corename(void) {
 	return corename[0];
 }
 
-// __builtin_cpu_is is not supported by zarch
+/**
+ * Detect the fitting set of kernels by retrieving the CPU features supported by
+ * the OS from the auxiliary value AT_HWCAP and choosing the set of kernels
+ * ("coretype") that exploits most of the features and can be compiled with the
+ * available gcc version.
+ * Note that we cannot use vector registers on a z13 or newer unless supported
+ * by the OS kernel (which needs to handle them properly during context switch).
+ */
 static gotoblas_t* get_coretype(void) {
-	FILE* infile;
-	char buffer[512], * p;
-	p = (char*)NULL;
-	infile = fopen("/proc/sysinfo", "r");
-	while (fgets(buffer, sizeof(buffer), infile)) {
-		if (!strncmp("Type", buffer, 4)) {
-			p = strchr(buffer, ':') + 2;
-#if 0
-			fprintf(stderr, "%s\n", p);
-#endif
-			break;
-		}
-	}
+	unsigned long hwcap __attribute__((unused)) = get_hwcap();
 
-	fclose(infile);
-
-#ifdef HAVE_Z13_SUPPORT
-	if (strstr(p, "2964") || strstr(p, "2965")) return &gotoblas_Z13;
-#endif
-
-	// Z14 and Z15 systems
-	if (strstr(p, "3906") || strstr(p, "3907") || strstr(p, "8561") ||
-	    strstr(p, "8562"))
+	// z14 and z15 systems: exploit Vector Facility (SIMD) and
+	// Vector-Enhancements Facility 1 (float SIMD instructions), if present.
 #ifdef HAVE_Z14_SUPPORT
+	if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
 		return &gotoblas_Z14;
-#else
+#endif
+
+	// z13: Vector Facility (SIMD for double)
+#ifdef HAVE_Z13_SUPPORT
+	if (hwcap & HWCAP_S390_VX)
 		return &gotoblas_Z13;
 #endif
 
-	// unknown system or compiler too old? use generic code for z architecture
+	// fallback in case of missing compiler support, systems before z13, or
+	// when the OS does not advertise support for the Vector Facility (e.g.,
+	// missing support in the OS kernel)
 	return &gotoblas_ZARCH_GENERIC;
 }

From d7c1677c20c326d4bf0f2cefc2c7ce36f7df3149 Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand
Date: Tue, 12 May 2020 11:09:28 +0200
Subject: [PATCH 09/14] Update CONTRIBUTORS.md, adding myself

Signed-off-by: Marius Hillenbrand
---
 CONTRIBUTORS.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 6d18047fb..738475a93 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -183,4 +183,6 @@ In chronological order:
 
 * Rajalakshmi Srinivasaraghavan
   * [2020-04-15] Half-precision GEMM for bfloat16
-
+
+* Marius Hillenbrand
+  * [2020-05-12] Revise dynamic architecture detection for IBM z

From 43c0d4f312ba3cd1a0ff8f389e6eded98113c0dd Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand
Date: Tue, 12 May 2020 14:13:54 +0200
Subject: [PATCH 10/14] s390x: Add vectorized sgemm kernel for Z14 and newer

Add a new GEMM kernel implementation to exploit the FP32 SIMD operations
introduced with z14 and employ it for SGEMM on z14 and newer
architectures.

The SIMD extensions introduced with z13 support operations on
double-sized scalars in vector registers. Thus, the existing SGEMM code
would extend floats to doubles before operating on them. z14 extended
SIMD support to operations on 32-bit floats. By employing these
instructions, we can operate on twice the number of scalars per
instruction (four floats in each vector register) and avoid the
conversion operations.

The code is written in C with explicit vectorization.
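As a minimal sketch of the idea (illustrative only: the function name
and the fixed 4x1 block shape are invented here, while the kernel below
generalizes this to ROWS x COLS blocks, applies alpha, and handles
leading dimensions):

    #include <stddef.h>

    typedef float vector_float __attribute__ ((vector_size (16)));

    /* Accumulate one 4x1 block of C: stream bk packed columns of a
     * four-row block of A against one column of B, 4 floats at a time. */
    static void block_4x1(const float *a, const float *b, float *c, size_t bk)
    {
            vector_float acc = {0.0f, 0.0f, 0.0f, 0.0f};
            for (size_t k = 0; k < bk; k++)
                    /* vector * scalar: four multiply-adds per iteration */
                    acc += *(const vector_float *)(a + 4 * k) * b[k];
            *(vector_float *)c += acc;
    }

Each step of the loop operates on four floats at once, where the z13
SIMD instructions could only carry two doubles per 16-byte register.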
In experiments, this kernel improves performance on z14 and z15 by
around 2x over the current implementation in assembly. The flexibility
of the C code paves the way for adjustments in subsequent commits.

Tested via make -C test / ctest / utest and by a couple of additional
unit tests that exercise blocking (e.g., partial register blocks with
fewer than UNROLL_M rows and/or fewer than UNROLL_N columns).

Signed-off-by: Marius Hillenbrand
---
 Makefile.zarch          |   2 +-
 kernel/zarch/KERNEL.Z14 |   4 +-
 kernel/zarch/gemm_vec.c | 342 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 345 insertions(+), 3 deletions(-)
 create mode 100644 kernel/zarch/gemm_vec.c

diff --git a/Makefile.zarch b/Makefile.zarch
index 47ea1eb71..be1e34f6d 100644
--- a/Makefile.zarch
+++ b/Makefile.zarch
@@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector
 endif
 
 ifeq ($(CORE), Z14)
-CCOMMON_OPT += -march=z14 -mzvector
+CCOMMON_OPT += -march=z14 -mzvector -O3
 FCOMMON_OPT += -march=z14 -mzvector
 endif

diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14
index f6e3bec23..bd3a966b1 100644
--- a/kernel/zarch/KERNEL.Z14
+++ b/kernel/zarch/KERNEL.Z14
@@ -91,7 +91,7 @@ DTRMMKERNEL = trmm8x4V.S
 CTRMMKERNEL = ctrmm4x4V.S
 ZTRMMKERNEL = ztrmm4x4V.S
 
-SGEMMKERNEL = strmm8x4V.S
+SGEMMKERNEL = gemm_vec.c
 SGEMMINCOPY = ../generic/gemm_ncopy_8.c
 SGEMMITCOPY = ../generic/gemm_tcopy_8.c
 SGEMMONCOPY = ../generic/gemm_ncopy_4.c
@@ -102,7 +102,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
-   
+
 DGEMMKERNEL = gemm8x4V.S
 DGEMMINCOPY = ../generic/gemm_ncopy_8.c
 DGEMMITCOPY = ../generic/gemm_tcopy_8.c

diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c
new file mode 100644
index 000000000..e6d613c44
--- /dev/null
+++ b/kernel/zarch/gemm_vec.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) IBM Corporation 2020.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of the OpenBLAS project nor the names of
+ *    its contributors may be used to endorse or promote products
+ *    derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "common.h"
+#include <vecintrin.h>
+
+#include
+#include
+#include
+
+#ifdef COMPLEX
+#error "Handling for complex numbers is not supported in this kernel"
+#endif
+
+#ifdef DOUBLE
+#define UNROLL_M DGEMM_DEFAULT_UNROLL_M
+#define UNROLL_N DGEMM_DEFAULT_UNROLL_N
+#else
+#define UNROLL_M SGEMM_DEFAULT_UNROLL_M
+#define UNROLL_N SGEMM_DEFAULT_UNROLL_N
+#endif
+
+static const size_t unroll_m = UNROLL_M;
+static const size_t unroll_n = UNROLL_N;
+
+/*
+ * Background:
+ *
+ * The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication
+ * problem by splitting all matrices into partitions multiple times, so that the
+ * submatrices fit into the L1 or L2 caches. As a result, each multiplication of
+ * submatrices can stream data fast from L1 and L2 caches. In between, it copies
+ * and rearranges the submatrices to enable contiguous memory accesses to
+ * improve locality in both caches and TLBs.
+ *
+ * At the heart of the algorithm is this kernel, which multiplies a "Block
+ * matrix" A (small dimensions) with a "Panel matrix" B (number of rows is
+ * small) and adds the result into a "Panel matrix" C; GotoBLAS calls this
+ * operation GEBP. This kernel further partitions GEBP twice, such that (1)
+ * submatrices of C and B fit into the L1 caches (GEBP_column_block) and (2) a
+ * block of C fits into the registers, while multiplying panels from A and B
+ * streamed from the L2 and L1 cache, respectively (GEBP_block).
+ *
+ *
+ * Algorithm GEBP(A, B, C, m, n, k, alpha):
+ *
+ * The problem is calculating C += alpha * (A * B)
+ * C is an m x n matrix, A is an m x k matrix, B is an k x n matrix.
+ *
+ * - C is in column-major-order, with an offset of ldc to the element in the
+ *   next column (same row).
+ * - A is in row-major-order yet stores SGEMM_UNROLL_M elements of each column
+ *   contiguously while walking along rows.
+ * - B is in column-major-order but packs SGEMM_UNROLL_N elements of a row
+ *   contiguously.
+ * If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or
+ * SGEMM_UNROLL_N, the remaining elements are arranged in blocks with power-of-2
+ * dimensions (e.g., 5 remaining columns would be in a block-of-4 and a
+ * block-of-1).
+ *
+ * Note that packing A and B into that form is taken care of by the caller in
+ * driver/level3/level3.c (actually done by "copy kernels").
+ *
+ * Steps:
+ * - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and B_j.
+ *   Now, B_j should fit into the L1 cache.
+ * - For each partition, calculate C_j += alpha * (A * B_j) by
+ *   (1) Calculate C_aux := A * B_j (see below)
+ *   (2) unpack C_j = C_j + alpha * C_aux
+ *
+ *
+ * Algorithm for Calculating C_aux:
+ *
+ * - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows,
+ *   such that the m_r x n_r-submatrix of C_aux can be held in registers. Each
+ *   submatrix of C_aux can be calculated independently, and the registers are
+ *   added back into C_j.
+ *
+ * - For each row-block of C_aux:
+ *   (uses a row block of A and full B_j)
+ *   - stream over all columns of A, multiply with elements from B and
+ *     accumulate in registers. (use different inner-kernels to exploit
+ *     vectorization for varying block sizes)
+ *   - add alpha * row block of C_aux back into C_j.
+ *
+ * Reference:
+ *
+ * The summary above is based on staring at various kernel implementations and:
+ * K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix
+ * Multiplication, in ACM Transactions on Mathematical Software, Vol. 34, No.
+ * 3, May 2008.
+ */
+
+#define VLEN_BYTES 16
+#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))
+
+typedef FLOAT vector_float __attribute__ ((vector_size (16)));
+
+/**
+ * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
+ *
+ * @param[in] A Pointer to current block of input matrix A.
+ * @param[in] k Number of columns in A.
+ * @param[in] B Pointer to current block of input matrix B.
+ * @param[inout] C Pointer to current block of output matrix C.
+ * @param[in] ldc Offset between elements in adjacent columns in C.
+ * @param[in] alpha Scalar factor.
+ */
+#define VECTOR_BLOCK(ROWS, COLS) \
+	static inline void GEBP_block_##ROWS##_##COLS( \
+	    FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, \
+	    FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \
+		_Static_assert( \
+		    ROWS % VLEN_FLOATS == 0, \
+		    "rows in block must be multiples of vector length"); \
+		vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \
+		\
+		for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) \
+			for (BLASLONG j = 0; j < COLS; j++) \
+				Caux[i][j] = vec_splats(ZERO); \
+		\
+		/* \
+		 * Stream over the row-block of A, which is packed \
+		 * column-by-column, multiply by coefficients in B and add up \
+		 * into temporaries Caux (which the compiler will hold in \
+		 * registers). Vectorization: Multiply column vectors from A \
+		 * with scalars from B and add up in column vectors of Caux. \
+		 * That equates to unrolling the loop over rows (in i) and \
+		 * executing each unrolled iteration as a vector element. \
+		 */ \
+		for (BLASLONG k = 0; k < bk; k++) { \
+			for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
+				vector_float Ak = \
+				    *(vector_float *)(A + i * VLEN_FLOATS + \
+						      k * ROWS); \
+				\
+				for (BLASLONG j = 0; j < COLS; j++) \
+					Caux[i][j] += Ak * B[j + k * COLS]; \
+			} \
+		} \
+		\
+		/* \
+		 * Unpack row-block of C_aux into outer C_i, multiply by \
+		 * alpha and add up. \
+		 */ \
+		for (BLASLONG j = 0; j < COLS; j++) { \
+			for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
+				vector_float *C_ij = \
+				    (vector_float *)(C + i * VLEN_FLOATS + \
+						     j * ldc); \
+				*C_ij += alpha * Caux[i][j]; \
+			} \
+		} \
+	}
+
+
+VECTOR_BLOCK(8, 4)
+VECTOR_BLOCK(8, 2)
+VECTOR_BLOCK(8, 1)
+VECTOR_BLOCK(4, 4)
+VECTOR_BLOCK(4, 2)
+VECTOR_BLOCK(4, 1)
+
+#ifdef DOUBLE
+VECTOR_BLOCK(2, 4)
+VECTOR_BLOCK(2, 2)
+#endif
+
+/**
+ * Handle calculation for row blocks in C_i of any size by dispatching into
+ * macro-defined (inline) functions or by deferring to a simple generic
+ * implementation. Note that the compiler can remove this awkward-looking
+ * dispatching code while inlining.
+ *
+ * @param[in] m Number of rows in block C_i.
+ * @param[in] n Number of columns in block C_i.
+ * @param[in] first_row Index of first row of the block C_i (relative to C).
+ * @param[in] A Pointer to input matrix A (note: all of it).
+ * @param[in] k Number of columns in A and rows in B.
+ * @param[in] B Pointer to current column block (panel) of input matrix B.
+ * @param[inout] C Pointer to current column block (panel) of output matrix C.
+ * @param[in] ldc Offset between elements in adjacent columns in C.
+ * @param[in] alpha Scalar factor.
+ */ +static inline void GEBP_block(BLASLONG m, BLASLONG n, + BLASLONG first_row, + const FLOAT * restrict A, BLASLONG k, + const FLOAT * restrict B, + FLOAT *restrict C, BLASLONG ldc, + FLOAT alpha) +{ + A += first_row * k; + C += first_row; + +#define BLOCK(bm, bn) \ + if (m == bm && n == bn) { \ + GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ + return; \ + } + + BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); + BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); + + #ifdef DOUBLE + BLOCK(2, 4); + BLOCK(2, 2); + #endif + +#undef BLOCK + + /* simple implementation for smaller block sizes: */ + FLOAT Caux[m][n] __attribute__ ((aligned (16))); + + /* + * Peel off first iteration (i.e., column of A) for initializing Caux + */ + for (BLASLONG i = 0; i < m; i++) + for (BLASLONG j = 0; j < n; j++) + Caux[i][j] = A[i] * B[j]; + + for (BLASLONG kk = 1; kk < k; kk++) + for (BLASLONG i = 0; i < m; i++) + for (BLASLONG j = 0; j < n; j++) + Caux[i][j] += A[i + kk * m] * B[j + kk * n]; + + for (BLASLONG i = 0; i < m; i++) + for (BLASLONG j = 0; j < n; j++) + C[i + j * ldc] += alpha * Caux[i][j]; +} + +/** + * Handle a column block (panel) of C and B while calculating C += alpha(A * B). + * + * @param[in] num_cols Number of columns in the block (in C and B). + * @param[in] first_col First column of the current block (in C and B). + * @param[in] A Pointer to input matrix A. + * @param[in] bk Number of columns in A and rows in B. + * @param[in] B Pointer to input matrix B (note: all of it). + * @param[in] bm Number of rows in C and A. + * @param[inout] C Pointer to output matrix C (note: all of it). + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + */ +static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, + const FLOAT *restrict A, BLASLONG bk, + const FLOAT *restrict B, BLASLONG bm, + FLOAT *restrict C, BLASLONG ldc, + FLOAT alpha) { + FLOAT *restrict C_i = C + first_col * ldc; + /* + * B is in column-order with n_r packed row elements, which does + * not matter -- we always move in full such blocks of + * column*pack + */ + const FLOAT *restrict B_i = B + first_col * bk; + + /* + * Calculate C_aux := A * B_j + * then unpack C_i += alpha * C_aux. + * + * For that purpose, further partition C_aux and A into blocks + * of m_r (unroll_m) rows, or powers-of-2 if smaller. + */ + BLASLONG row = 0; + for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2) + for (; bm - row >= block_size; row += block_size) + GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i, + ldc, alpha); +} + +/** + * Inner kernel for matrix-matrix multiplication. C += alpha (A * B) + * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and + * C are pointers to submatrices of the actual matrices. + * + * @param[in] bm Number of rows in C and A. + * @param[in] bn Number of columns in C and B. + * @param[in] bk Number of columns in A and rows in B. + * @param[in] alpha Scalar factor. + * @param[in] ba Pointer to input matrix A. + * @param[in] bb Pointer to input matrix B. + * @param[inout] C Pointer to output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @returns 0 on success. 
+ */
+int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
+	  FLOAT *restrict ba, FLOAT *restrict bb,
+	  FLOAT *restrict C, BLASLONG ldc)
+{
+	if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO))
+		return 0;
+
+	/*
+	 * interface code allocates buffers for ba and bb at page
+	 * granularity (i.e., using mmap(MAP_ANONYMOUS)), so enable the
+	 * compiler to make use of that fact in vector load operations.
+	 */
+	ba = __builtin_assume_aligned(ba, 16);
+	bb = __builtin_assume_aligned(bb, 16);
+
+	/*
+	 * Partition B and C into blocks of n_r (unroll_n) columns, called B_i
+	 * and C_i. For each partition, calculate C_i += alpha * (A * B_j).
+	 *
+	 * For remaining columns that do not fill up a block of n_r, iteratively
+	 * use smaller block sizes of powers of 2.
+	 */
+	BLASLONG col = 0;
+	for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2)
+		for (; bn - col >= block_size; col += block_size)
+			GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha);
+
+	return 0;
+}

From 71b6eaf459e55e7b5fe5047052c39c49f16c3680 Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand
Date: Tue, 12 May 2020 14:40:30 +0200
Subject: [PATCH 11/14] s390x: Use new sgemm kernel also for strmm on Z14 and
 newer

Employ the newly added GEMM kernel also for STRMM on Z14. The
implementation in C with vector intrinsics exploits FP32 SIMD operations
and thereby gains performance over the existing assembly code. Extend
the implementation to handle triangular matrix multiplication
accordingly. As an added benefit, the more flexible C code enables us to
adjust register blocking in the subsequent commit.

Tested via make -C test / ctest / utest and by a couple of additional
unit tests that exercise blocking.

Signed-off-by: Marius Hillenbrand
---
 kernel/zarch/KERNEL.Z14 |   8 +---
 kernel/zarch/gemm_vec.c | 104 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 98 insertions(+), 14 deletions(-)

diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14
index bd3a966b1..49fa28175 100644
--- a/kernel/zarch/KERNEL.Z14
+++ b/kernel/zarch/KERNEL.Z14
@@ -86,7 +86,7 @@ DGEMVTKERNEL = dgemv_t_4.c
 CGEMVTKERNEL = cgemv_t_4.c
 ZGEMVTKERNEL = zgemv_t_4.c
 
-STRMMKERNEL = strmm8x4V.S
+STRMMKERNEL = gemm_vec.c
 DTRMMKERNEL = trmm8x4V.S
 CTRMMKERNEL = ctrmm4x4V.S
 ZTRMMKERNEL = ztrmm4x4V.S
@@ -101,8 +101,6 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
-
-
 DGEMMKERNEL = gemm8x4V.S
 DGEMMINCOPY = ../generic/gemm_ncopy_8.c
 DGEMMITCOPY = ../generic/gemm_tcopy_8.c
@@ -145,7 +143,3 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 
-
-
-
-

diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c
index e6d613c44..a9531c7a5 100644
--- a/kernel/zarch/gemm_vec.c
+++ b/kernel/zarch/gemm_vec.c
@@ -51,6 +51,29 @@
 static const size_t unroll_m = UNROLL_M;
 static const size_t unroll_n = UNROLL_N;
 
+/* Handling of triangular matrices */
+#ifdef TRMMKERNEL
+static const bool trmm = true;
+static const bool left =
+#ifdef LEFT
+	true;
+#else
+	false;
+#endif
+
+static const bool backwards =
+#if defined(LEFT) != defined(TRANSA)
+	true;
+#else
+	false;
+#endif
+
+#else
+static const bool trmm = false;
+static const bool left = false;
+static const bool backwards = false;
+#endif /* TRMMKERNEL */
+
 /*
  * Background:
  *
@@ -111,6 +134,17 @@ static const size_t unroll_n = UNROLL_N;
  *     vectorization for varying block sizes)
  *   - add alpha * row block
of C_aux back into C_j. * + * Note that there are additional mechanics for handling triangular matrices, + * calculating B := alpha (A * B) where either of the matrices A or B can be + * triangular. In case of A, the macro "LEFT" is defined. In addition, A can + * optionally be transposed. + * The code effectively skips an "offset" number of columns in A and rows of B + * in each block, to save unnecessary work by exploiting the triangular nature. + * To handle all cases, the code discerns (1) a "left" mode when A is triangular + * and (2) "forward" / "backwards" modes where only the first "offset" + * columns/rows of A/B are used or where the first "offset" columns/rows are + * skipped, respectively. + * * Reference: * * The summary above is based on staring at various kernel implementations and: @@ -176,7 +210,11 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16))); vector_float *C_ij = \ (vector_float *)(C + i * VLEN_FLOATS + \ j * ldc); \ - *C_ij += alpha * Caux[i][j]; \ + if (trmm) { \ + *C_ij = alpha * Caux[i][j]; \ + } else { \ + *C_ij += alpha * Caux[i][j]; \ + } \ } \ } \ } @@ -209,17 +247,37 @@ VECTOR_BLOCK(2, 2) * @param[inout] C Pointer to current column block (panel) of output matrix C. * @param[in] ldc Offset between elements in adjacent columns in C. * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + * @param[in] off Running offset for handling triangular matrices. */ static inline void GEBP_block(BLASLONG m, BLASLONG n, BLASLONG first_row, const FLOAT * restrict A, BLASLONG k, const FLOAT * restrict B, FLOAT *restrict C, BLASLONG ldc, - FLOAT alpha) + FLOAT alpha, + BLASLONG offset, BLASLONG off) { + if (trmm && left) + off = offset + first_row; + A += first_row * k; C += first_row; + if (trmm) { + if (backwards) { + A += off * m; + B += off * n; + k -= off; + } else { + if (left) { + k = off + m; + } else { + k = off + n; + } + } + } + #define BLOCK(bm, bn) \ if (m == bm && n == bn) { \ GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ @@ -253,7 +311,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, for (BLASLONG i = 0; i < m; i++) for (BLASLONG j = 0; j < n; j++) - C[i + j * ldc] += alpha * Caux[i][j]; + if (trmm) { + C[i + j * ldc] = alpha * Caux[i][j]; + } else { + C[i + j * ldc] += alpha * Caux[i][j]; + } } /** @@ -268,12 +330,15 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, * @param[inout] C Pointer to output matrix C (note: all of it). * @param[in] ldc Offset between elements in adjacent columns in C. * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). */ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, const FLOAT *restrict A, BLASLONG bk, const FLOAT *restrict B, BLASLONG bm, FLOAT *restrict C, BLASLONG ldc, - FLOAT alpha) { + FLOAT alpha, + BLASLONG const offset) { + FLOAT *restrict C_i = C + first_col * ldc; /* * B is in column-order with n_r packed row elements, which does @@ -282,6 +347,15 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, */ const FLOAT *restrict B_i = B + first_col * bk; + BLASLONG off = 0; + if (trmm) { + if (left) { + off = offset; + } else { + off = -offset + first_col; + } + } + /* * Calculate C_aux := A * B_j * then unpack C_i += alpha * C_aux. 
@@ -293,7 +367,7 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
 	for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2)
 		for (; bm - row >= block_size; row += block_size)
 			GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i,
-				   ldc, alpha);
+				   ldc, alpha, offset, off);
 }
 
 /**
@@ -301,6 +375,9 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
 * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and
 * C are pointers to submatrices of the actual matrices.
 *
+ * For triangular matrix multiplication, calculate B := alpha (A * B) where A
+ * or B can be triangular (in case of A, the macro LEFT will be defined).
+ *
 * @param[in] bm Number of rows in C and A.
 * @param[in] bn Number of columns in C and B.
 * @param[in] bk Number of columns in A and rows in B.
@@ -309,11 +386,16 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
 * @param[in] bb Pointer to input matrix B.
 * @param[inout] C Pointer to output matrix C.
 * @param[in] ldc Offset between elements in adjacent columns in C.
+ * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
 * @returns 0 on success.
 */
 int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
 	  FLOAT *restrict ba, FLOAT *restrict bb,
-	  FLOAT *restrict C, BLASLONG ldc)
+	  FLOAT *restrict C, BLASLONG ldc
+#ifdef TRMMKERNEL
+	  , BLASLONG offset
+#endif
+	  )
 {
 	if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO))
 		return 0;
@@ -326,6 +408,14 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
 	ba = __builtin_assume_aligned(ba, 16);
 	bb = __builtin_assume_aligned(bb, 16);
 
+	/*
+	 * Use offset and off even when compiled as SGEMMKERNEL to simplify
+	 * function signatures and function calls.
+	 */
+#ifndef TRMMKERNEL
+	BLASLONG const offset = 0;
+#endif
+
 	/*
 	 * Partition B and C into blocks of n_r (unroll_n) columns, called B_i
 	 * and C_i. For each partition, calculate C_i += alpha * (A * B_j).
@@ -336,7 +426,7 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
 	BLASLONG col = 0;
 	for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2)
 		for (; bn - col >= block_size; col += block_size)
-			GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha);
+			GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset);
 
 	return 0;
 }

From 1b0b4349a11f8de40037d9bddf9ddb9b094cdd2c Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand
Date: Tue, 12 May 2020 15:06:38 +0200
Subject: [PATCH 12/14] s390x/Z14: Change register blocking for SGEMM to 16x4

Change register blocking for SGEMM (and STRMM) on z14 from 8x4 to 16x4
by adjusting SGEMM_DEFAULT_UNROLL_M and choosing the appropriate copy
implementations. Actually make KERNEL.Z14 more flexible, so that the
change in param.h suffices. As a result, performance for SGEMM improves
by around 30% on z15.

On z14, FP SIMD instructions can operate on float-sized scalars in
vector registers, while z13 could do that for double-sized scalars only.
Thus, we can double the number of elements of C that are held in
registers in an SGEMM kernel.
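(Back-of-envelope arithmetic, assuming the 32 vector registers of z13
and newer: a 16x4 block of C_aux is 64 floats, i.e. 16 vector registers
of four floats each -- half the register file, leaving the rest for
streaming A and B -- whereas the same 16 registers hold only an 8x4
block of doubles.)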
Signed-off-by: Marius Hillenbrand --- kernel/zarch/KERNEL.Z14 | 10 ++++++---- kernel/zarch/gemm_vec.c | 15 +++++++++++++++ param.h | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index 49fa28175..96e6745fd 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -92,12 +92,14 @@ CTRMMKERNEL = ctrmm4x4V.S ZTRMMKERNEL = ztrmm4x4V.S SGEMMKERNEL = gemm_vec.c -SGEMMINCOPY = ../generic/gemm_ncopy_8.c -SGEMMITCOPY = ../generic/gemm_tcopy_8.c -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index a9531c7a5..4e1b3e3fb 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -220,6 +220,15 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16))); } +#if UNROLL_M == 16 +VECTOR_BLOCK(16, 4) +VECTOR_BLOCK(16, 2) +VECTOR_BLOCK(16, 1) +#endif +#if UNROLL_N == 8 +VECTOR_BLOCK(8, 8) +VECTOR_BLOCK(4, 8) +#endif VECTOR_BLOCK(8, 4) VECTOR_BLOCK(8, 2) VECTOR_BLOCK(8, 1) @@ -284,6 +293,12 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, return; \ } +#if UNROLL_M == 16 + BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1); +#endif +#if UNROLL_N == 8 + BLOCK(8, 8); BLOCK(4, 8); +#endif BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); diff --git a/param.h b/param.h index 7094249e8..6f0a3b727 100644 --- a/param.h +++ b/param.h @@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 8 From cb9dc36dd5d7ecf40cd8f3d8e9ffe08bc525c427 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 12 May 2020 16:14:00 +0200 Subject: [PATCH 13/14] Update CONTRIBUTORS.md Signed-off-by: Marius Hillenbrand --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 738475a93..fd4ab4bec 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -186,3 +186,4 @@ In chronological order: * Marius Hillenbrand * [2020-05-12] Revise dynamic architecture detection for IBM z + * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 From 2840432e49ca57f8338c46575a44dfe1416a20d3 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Wed, 13 May 2020 17:48:50 +0200 Subject: [PATCH 14/14] s390x: improvise vector alignment hints for older compilers Introduce inline assembly so that we can employ vector loads with alignment hints on older compilers (pre gcc-9), since these are still used in distributions such as RHEL 8 and Ubuntu 18.04 LTS. Informing the hardware about alignment can speed up vector loads. For that purpose, we can encode hints about 8-byte or 16-byte alignment of the memory operand into the opcodes. gcc-9 and newer automatically emit such hints, where applicable. Add a bit of inline assembly that achieves the same for older compilers. 
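For illustration (assembler mnemonics here are an assumption, not taken
from the patch below), the hint is an extra, optional operand of the
vector-load instruction:

    vl %v0,0(%r3)      # plain 16-byte vector load
    vl %v0,0(%r3),3    # same load, hinting that the operand is 8-byte aligned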
Since an older binutils may not know about the additional operand for
the hints, we explicitly encode the opcode in hex.

Signed-off-by: Marius Hillenbrand
---
 kernel/zarch/gemm_vec.c | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c
index 4e1b3e3fb..2d4457f06 100644
--- a/kernel/zarch/gemm_vec.c
+++ b/kernel/zarch/gemm_vec.c
@@ -158,6 +158,32 @@ static const bool backwards = false;
 
 typedef FLOAT vector_float __attribute__ ((vector_size (16)));
 
+/**
+ * Load a vector into a register, hinting at its 8-byte alignment to improve
+ * performance. gcc-9 and newer create these hints automatically. For older
+ * compiler versions, use inline assembly to explicitly express the hint.
+ * Provide explicit hex encoding to cater for binutils versions that do not know
+ * about vector-load with alignment hints yet.
+ *
+ * Note that, for block sizes where we apply vectorization, vectors in A will
+ * always be 8-byte aligned.
+ */
+static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
+	vector_float const *restrict addr = (vector_float const *restrict)a;
+	vector_float y;
+
+#if __GNUC__ < 9
+	// hex-encode vl %[out],%[addr],3
+	asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
+	    : [ out ] "=v"(y)
+	    : [ addr ] "R"(*addr));
+#else
+	y = *addr;
+#endif
+
+	return y;
+}
+
 /**
  * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
  *
@@ -192,9 +218,8 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
 		 */ \
 		for (BLASLONG k = 0; k < bk; k++) { \
 			for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
-				vector_float Ak = \
-				    *(vector_float *)(A + i * VLEN_FLOATS + \
-						      k * ROWS); \
+				vector_float Ak = vec_load_hinted( \
+				    A + i * VLEN_FLOATS + k * ROWS); \
 				\
 				for (BLASLONG j = 0; j < COLS; j++) \
 					Caux[i][j] += Ak * B[j + k * COLS]; \