Merge pull request #2615 from mhillenibm/z14_alignment_hints

s390x: improvise vector alignment hints for older compilers
This commit is contained in:
Martin Kroeker 2020-05-14 21:06:34 +02:00 committed by GitHub
commit 20245ded5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 28 additions and 3 deletions

View File

@ -158,6 +158,32 @@ static const bool backwards = false;
typedef FLOAT vector_float __attribute__ ((vector_size (16))); typedef FLOAT vector_float __attribute__ ((vector_size (16)));
/**
 * Load one vector from memory into a register, hinting 8-byte alignment to
 * improve performance.
 *
 * gcc-9 and newer emit the alignment hint on their own for a plain vector
 * load. For older gcc versions, inline assembly expresses the hint
 * explicitly. The instruction is hex-encoded via .insn to cater for binutils
 * versions that do not yet know about vector-load with alignment hints.
 *
 * Note that, for block sizes where we apply vectorization, vectors in A will
 * always be 8-byte aligned.
 *
 * @param a pointer to the first of the FLOAT elements to load; must satisfy
 *          the 8-byte alignment promised by the hint
 * @return the vector read from a
 */
static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
	vector_float const *restrict addr = (vector_float const *restrict)a;
	vector_float y;

	/*
	 * Only genuine gcc older than 9 needs the hand-written hint. clang also
	 * defines __GNUC__ (to a low compatibility value), and an undefined
	 * __GNUC__ evaluates to 0 in #if, so both are excluded explicitly and
	 * fall through to the plain load below.
	 */
#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 9
	// hex-encode vl %[out],%[addr],3
	// __asm__ rather than asm: also accepted in strict ISO -std= modes
	__asm__(".insn vrx,0xe70000003006,%[out],%[addr],3"
		: [ out ] "=v"(y)
		: [ addr ] "R"(*addr));
#else
	y = *addr;
#endif

	return y;
}
/** /**
* Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics. * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
* *
@ -192,9 +218,8 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
*/ \ */ \
for (BLASLONG k = 0; k < bk; k++) { \ for (BLASLONG k = 0; k < bk; k++) { \
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
vector_float Ak = \ vector_float Ak = vec_load_hinted( \
*(vector_float *)(A + i * VLEN_FLOATS + \ A + i * VLEN_FLOATS + k * ROWS); \
k * ROWS); \
\ \
for (BLASLONG j = 0; j < COLS; j++) \ for (BLASLONG j = 0; j < COLS; j++) \
Caux[i][j] += Ak * B[j + k * COLS]; \ Caux[i][j] += Ak * B[j + k * COLS]; \