From 2840432e49ca57f8338c46575a44dfe1416a20d3 Mon Sep 17 00:00:00 2001
From: Marius Hillenbrand <mhillen@linux.ibm.com>
Date: Wed, 13 May 2020 17:48:50 +0200
Subject: [PATCH] s390x: improvise vector alignment hints for older compilers

Introduce inline assembly so that we can employ vector loads with
alignment hints on older compilers (pre gcc-9), since these are still
used in distributions such as RHEL 8 and Ubuntu 18.04 LTS.

Informing the hardware about alignment can speed up vector loads. For
that purpose, we can encode hints about 8-byte or 16-byte alignment of
the memory operand into the opcodes. gcc-9 and newer automatically emit
such hints, where applicable. Add a bit of inline assembly that achieves
the same for older compilers. Since an older binutils may not know about
the additional operand for the hints, we explicitly encode the opcode in
hex.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
---
 kernel/zarch/gemm_vec.c | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c
index 4e1b3e3fb..2d4457f06 100644
--- a/kernel/zarch/gemm_vec.c
+++ b/kernel/zarch/gemm_vec.c
@@ -158,6 +158,32 @@ static const bool backwards = false;
 
 typedef FLOAT vector_float __attribute__ ((vector_size (16)));
 
+/**
+ * Load the 16-byte vector at a into a register, hinting 8-byte alignment to
+ * improve performance. gcc-9 and newer emit such hints on their own; for older
+ * compilers, inline assembly expresses the hint explicitly. The instruction is
+ * given in hex encoding to cater for binutils versions that do not know about
+ * vector loads with alignment hints yet.
+ *
+ * Note that, for block sizes where we apply vectorization, vectors in A will
+ * always be 8-byte aligned.
+ */
+static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
+	vector_float const *restrict addr = (vector_float const *restrict)a;
+	vector_float y;
+
+#if __GNUC__ < 9
+	// hex-encoded form of: vl %[out],%[addr],3 (3 = 8-byte alignment hint)
+	asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
+	    : [ out ] "=v"(y)
+	    : [ addr ] "R"(*addr));
+#else
+	y = *addr;
+#endif
+
+	return y;
+}
+
 /**
  * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
  *
@@ -192,9 +218,8 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
 		 */                                                           \
 		for (BLASLONG k = 0; k < bk; k++) {                           \
 			for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) {   \
-				vector_float Ak =                             \
-				    *(vector_float *)(A + i * VLEN_FLOATS +   \
-						      k * ROWS);              \
+				vector_float Ak = vec_load_hinted(            \
+				    A + i * VLEN_FLOATS + k * ROWS);          \
                                                                               \
 				for (BLASLONG j = 0; j < COLS; j++)           \
 					Caux[i][j] += Ak * B[j + k * COLS];   \