diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c
index 4e1b3e3fb..2d4457f06 100644
--- a/kernel/zarch/gemm_vec.c
+++ b/kernel/zarch/gemm_vec.c
@@ -158,6 +158,32 @@ static const bool backwards = false;
 
 typedef FLOAT vector_float __attribute__ ((vector_size (16)));
 
+/**
+ * Load a vector from A into a register with an 8-byte alignment hint, which
+ * improves performance. gcc-9 and newer emit the hint by themselves. Older gcc
+ * needs inline assembly, hex-encoded via .insn because older binutils do not
+ * know vector-load with alignment hints. clang also defines __GNUC__ (4 by
+ * default), so it is excluded explicitly and always takes the plain load.
+ *
+ * Note that, for block sizes where we apply vectorization, vectors in A will
+ * always be 8-byte aligned.
+ */
+static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
+	vector_float const *restrict addr = (vector_float const *restrict)a;
+	vector_float y;
+
+#if __GNUC__ < 9 && !defined(__clang__)
+	// hex-encode vl %[out],%[addr],3
+	asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
+	    : [ out ] "=v"(y)
+	    : [ addr ] "R"(*addr));
+#else
+	y = *addr;
+#endif
+
+	return y;
+}
+
 /**
  * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
  *
@@ -192,9 +218,8 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
 		 */ \
 		for (BLASLONG k = 0; k < bk; k++) { \
 			for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
-				vector_float Ak = \
-				    *(vector_float *)(A + i * VLEN_FLOATS + \
-						      k * ROWS); \
+				vector_float Ak = vec_load_hinted( \
+				    A + i * VLEN_FLOATS + k * ROWS); \
 				\
 				for (BLASLONG j = 0; j < COLS; j++) \
 					Caux[i][j] += Ak * B[j + k * COLS]; \