From 346e30a46a4758eb4d9b8e5783c0b9c3c6b3ce6f Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 10 Dec 2020 11:51:42 -0600 Subject: [PATCH] POWER10: Improve axpy performance This patch aligns the stores to 32 byte boundary for saxpy and daxpy before entering into vector pair loop. Fox caxpy, changed the store instructions to stxv to improve performance of unaligned cases. --- kernel/power/caxpy_microk_power10.c | 24 ++++++++++++++++-------- kernel/power/daxpy_power10.c | 15 +++++++++++---- kernel/power/saxpy_power10.c | 14 ++++++++++---- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c index 0d13416b3..56a5ab47a 100644 --- a/kernel/power/caxpy_microk_power10.c +++ b/kernel/power/caxpy_microk_power10.c @@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "addi %4, %4, 128 \n\t" "xxperm 52, 40, %x10 \n\t" // exchange real and imag part @@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" : diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c index ebe91a80f..8640efcfd 100644 --- a/kernel/power/daxpy_power10.c +++ b/kernel/power/daxpy_power10.c @@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -16; + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 ) + daxpy_kernel_8(n1, &x[i], &y[i], da); - if ( n1 ) - daxpy_kernel_8(n1, x, y, da); + i += n1; - i = n1; while(i < n) { diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c index 8c7c22390..4a13c1f88 100644 --- a/kernel/power/saxpy_power10.c +++ b/kernel/power/saxpy_power10.c @@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -64; - + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; if ( n1 ) - saxpy_kernel_64(n1, x, y, da); + saxpy_kernel_64(n1, &x[i], &y[i], da); - i = n1; + i += n1; while(i < n) {