diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c index 0d13416b3..56a5ab47a 100644 --- a/kernel/power/caxpy_microk_power10.c +++ b/kernel/power/caxpy_microk_power10.c @@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "addi %4, %4, 128 \n\t" "xxperm 52, 40, %x10 \n\t" // exchange real and imag part @@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" : diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c index ebe91a80f..8640efcfd 100644 --- a/kernel/power/daxpy_power10.c +++ b/kernel/power/daxpy_power10.c @@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -16; + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 ) + daxpy_kernel_8(n1, &x[i], &y[i], da); - if ( n1 ) - daxpy_kernel_8(n1, x, y, da); + i += n1; - i = n1; while(i < n) { diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c index 8c7c22390..4a13c1f88 100644 --- a/kernel/power/saxpy_power10.c +++ b/kernel/power/saxpy_power10.c @@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -64; - + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; if ( n1 ) - saxpy_kernel_64(n1, x, y, da); + saxpy_kernel_64(n1, &x[i], &y[i], da); - i = n1; + i += n1; while(i < n) {