Merge pull request #2615 from mhillenibm/z14_alignment_hints

s390x: improvise vector alignment hints for older compilers
2020-05-14 21:06:34 +02:00 · 2020-05-14 21:06:34 +02:00 · 20245ded5f
parent ea78106c71 2840432e49
commit 20245ded5f
1 changed files with 28 additions and 3 deletions
--- a/kernel/zarch/gemm_vec.c
+++ b/kernel/zarch/gemm_vec.c
@@ -158,6 +158,32 @@ static const bool backwards = false;
 /* Vector type (GCC vector extension): 16 bytes of FLOAT elements. */
 typedef FLOAT vector_float __attribute__ ((vector_size (16)));
 /**
 * Load one 16-byte vector from memory into a vector register, hinting 8-byte
 * alignment on the load to improve performance.
 *
 * gcc-9 and newer create these hints by themselves for a plain dereference.
 * For older gcc versions, inline assembly is used to express the hint
 * explicitly. The instruction is given as an explicit hex encoding via .insn
 * to cater for binutils versions that do not know about vector-load with
 * alignment hints yet.
 *
 * clang defines __GNUC__ as well (4 by default, for gcc compatibility), so a
 * bare version comparison would mistake every clang for an old gcc. clang is
 * therefore excluded explicitly and takes the plain-dereference path.
 *
 * Note that, for block sizes where we apply vectorization, vectors in A will
 * always be 8-byte aligned.
 *
 * @param[in]	a	Pointer to the first FLOAT of the vector to load.
 * @return	The vector read from a.
 */
 static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
 	vector_float const *restrict addr = (vector_float const *restrict)a;
 	vector_float y;
 #if !defined(__clang__) && __GNUC__ < 9
 	// hex-encode vl %[out],%[addr],3
 	// The memory operand *addr tells the compiler which object is read,
 	// so no "memory" clobber is needed.
 	asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
 	    : [ out ] "=v"(y)
 	    : [ addr ] "R"(*addr));
 #else
 	y = *addr;
 #endif
 	return y;
 }
 /**
 * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
 *
@@ -192,9 +218,8 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
 		 */                                                           \
 		for (BLASLONG k = 0; k < bk; k++) {                           \
 			for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) {   \
-				vector_float Ak =                             \
+				vector_float Ak = vec_load_hinted(            \
-				    *(vector_float *)(A + i * VLEN_FLOATS +   \
+				    A + i * VLEN_FLOATS + k * ROWS);          \
 						      k * ROWS);              \
                                                                              \
 				for (BLASLONG j = 0; j < COLS; j++)           \
 					Caux[i][j] += Ak * B[j + k * COLS];   \