diff --git a/kernel/power/ccopy_microk_power10.c b/kernel/power/ccopy_microk_power10.c deleted file mode 100644 index f30e1fa09..000000000 --- a/kernel/power/ccopy_microk_power10.c +++ /dev/null @@ -1,152 +0,0 @@ -/*************************************************************************** -Copyright (c) 2020, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define HAVE_KERNEL 1 - -static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ - ( - "lxvp 32, 0(%2) \n\t" - "lxvp 34, 32(%2) \n\t" - "lxvp 36, 64(%2) \n\t" - "lxvp 38, 96(%2) \n\t" - "lxvp 40, 128(%2) \n\t" - "lxvp 42, 160(%2) \n\t" - "lxvp 44, 192(%2) \n\t" - "lxvp 46, 224(%2) \n\t" - - "addi %2, %2, 256 \n\t" - "addic. %1, %1, -32 \n\t" - "ble two%= \n\t" - - ".align 5 \n" - "one%=: \n\t" -#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) - "stxv 32, 0(%3) \n\t" - "stxv 33, 16(%3) \n\t" - "stxv 34, 32(%3) \n\t" - "stxv 35, 48(%3) \n\t" - "stxv 36, 64(%3) \n\t" - "stxv 37, 80(%3) \n\t" - "stxv 38, 96(%3) \n\t" - "stxv 39, 112(%3) \n\t" -#else - "stxv 33, 0(%3) \n\t" - "stxv 32, 16(%3) \n\t" - "stxv 35, 32(%3) \n\t" - "stxv 34, 48(%3) \n\t" - "stxv 37, 64(%3) \n\t" - "stxv 36, 80(%3) \n\t" - "stxv 39, 96(%3) \n\t" - "stxv 38, 112(%3) \n\t" -#endif - "lxvp 32, 0(%2) \n\t" - "lxvp 34, 32(%2) \n\t" - "lxvp 36, 64(%2) \n\t" - "lxvp 38, 96(%2) \n\t" -#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) - "stxv 40, 128(%3) \n\t" - "stxv 41, 144(%3) \n\t" - "stxv 42, 160(%3) \n\t" - "stxv 43, 176(%3) \n\t" - "stxv 44, 192(%3) \n\t" - "stxv 45, 208(%3) \n\t" - "stxv 46, 224(%3) \n\t" - "stxv 47, 240(%3) \n\t" -#else - "stxv 41, 128(%3) \n\t" - "stxv 40, 144(%3) \n\t" - "stxv 43, 160(%3) \n\t" - "stxv 42, 176(%3) \n\t" - "stxv 45, 192(%3) \n\t" - "stxv 44, 208(%3) \n\t" - "stxv 47, 224(%3) \n\t" - "stxv 46, 240(%3) \n\t" -#endif - "lxvp 40, 128(%2) \n\t" - "lxvp 42, 160(%2) \n\t" - "lxvp 44, 192(%2) \n\t" - "lxvp 46, 224(%2) \n\t" - - - "addi %3, %3, 256 \n\t" - "addi %2, %2, 256 \n\t" - - "addic. %1, %1, -32 \n\t" - "bgt one%= \n" - - "two%=: \n\t" -#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) - "stxv 32, 0(%3) \n\t" - "stxv 33, 16(%3) \n\t" - "stxv 34, 32(%3) \n\t" - "stxv 35, 48(%3) \n\t" - "stxv 36, 64(%3) \n\t" - "stxv 37, 80(%3) \n\t" - "stxv 38, 96(%3) \n\t" - "stxv 39, 112(%3) \n\t" - "stxv 40, 128(%3) \n\t" - "stxv 41, 144(%3) \n\t" - "stxv 42, 160(%3) \n\t" - "stxv 43, 176(%3) \n\t" - "stxv 44, 192(%3) \n\t" - "stxv 45, 208(%3) \n\t" - "stxv 46, 224(%3) \n\t" - "stxv 47, 240(%3) \n\t" -#else - "stxv 33, 0(%3) \n\t" - "stxv 32, 16(%3) \n\t" - "stxv 35, 32(%3) \n\t" - "stxv 34, 48(%3) \n\t" - "stxv 37, 64(%3) \n\t" - "stxv 36, 80(%3) \n\t" - "stxv 39, 96(%3) \n\t" - "stxv 38, 112(%3) \n\t" - "stxv 41, 128(%3) \n\t" - "stxv 40, 144(%3) \n\t" - "stxv 43, 160(%3) \n\t" - "stxv 42, 176(%3) \n\t" - "stxv 45, 192(%3) \n\t" - "stxv 44, 208(%3) \n\t" - "stxv 47, 224(%3) \n\t" - "stxv 46, 240(%3) \n\t" -#endif - "#n=%1 x=%4=%2 y=%0=%3" - : - "=m" (*y), - "+r" (n), // 1 - "+b" (x), // 2 - "+b" (y) // 3 - : - "m" (*x) - : - "cr0", - "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", - "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" - ); -} diff --git a/kernel/power/ccopy_power10.c b/kernel/power/ccopy_power10.c index 41c510460..a5877cd12 100644 --- a/kernel/power/ccopy_power10.c +++ b/kernel/power/ccopy_power10.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(__VEC__) || defined(__ALTIVEC__) -#include "ccopy_microk_power10.c" +#include "copy_microk_power10.c" #endif #ifndef HAVE_KERNEL @@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -64; if ( n1 > 0 ) { copy_kernel(n1, x, y); diff --git a/kernel/power/copy_microk_power10.c b/kernel/power/copy_microk_power10.c index 8bca1a1e7..8c1c3b073 100644 --- a/kernel/power/copy_microk_power10.c +++ b/kernel/power/copy_microk_power10.c @@ -61,37 +61,97 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) ".align 5 \n" "one%=: \n\t" - "stxvp 32, 0(%3) \n\t" - "stxvp 34, 32(%3) \n\t" - "stxvp 36, 64(%3) \n\t" - "stxvp 38, 96(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" +#else + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" +#endif "lxvp 32, 0(%2) \n\t" "lxvp 34, 32(%2) \n\t" "lxvp 36, 64(%2) \n\t" "lxvp 38, 96(%2) \n\t" - "stxvp 40, 128(%3) \n\t" - "stxvp 42, 160(%3) \n\t" - "stxvp 44, 192(%3) \n\t" - "stxvp 46, 224(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" +#else + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" +#endif "lxvp 40, 128(%2) \n\t" "lxvp 42, 160(%2) \n\t" "lxvp 44, 192(%2) \n\t" "lxvp 46, 224(%2) \n\t" - "stxvp 48, 256(%3) \n\t" - "stxvp 50, 288(%3) \n\t" - "stxvp 52, 320(%3) \n\t" - "stxvp 54, 352(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 256(%3) \n\t" + "stxv 49, 272(%3) \n\t" + "stxv 50, 288(%3) \n\t" + "stxv 51, 304(%3) \n\t" + "stxv 52, 320(%3) \n\t" + "stxv 53, 336(%3) \n\t" + "stxv 54, 352(%3) \n\t" + "stxv 55, 368(%3) \n\t" +#else + "stxv 49, 256(%3) \n\t" + "stxv 48, 272(%3) \n\t" + "stxv 51, 288(%3) \n\t" + "stxv 50, 304(%3) \n\t" + "stxv 53, 320(%3) \n\t" + "stxv 52, 336(%3) \n\t" + "stxv 55, 352(%3) \n\t" + "stxv 54, 368(%3) \n\t" +#endif "lxvp 48, 256(%2) \n\t" "lxvp 50, 288(%2) \n\t" "lxvp 52, 320(%2) \n\t" "lxvp 54, 352(%2) \n\t" - "stxvp 56, 384(%3) \n\t" - "stxvp 58, 416(%3) \n\t" - "stxvp 60, 448(%3) \n\t" - "stxvp 62, 480(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 56, 384(%3) \n\t" + "stxv 57, 400(%3) \n\t" + "stxv 58, 416(%3) \n\t" + "stxv 59, 432(%3) \n\t" + "stxv 60, 448(%3) \n\t" + "stxv 61, 464(%3) \n\t" + "stxv 62, 480(%3) \n\t" + "stxv 63, 496(%3) \n\t" +#else + "stxv 57, 384(%3) \n\t" + "stxv 56, 400(%3) \n\t" + "stxv 59, 416(%3) \n\t" + "stxv 58, 432(%3) \n\t" + "stxv 61, 448(%3) \n\t" + "stxv 60, 464(%3) \n\t" + "stxv 63, 480(%3) \n\t" + "stxv 62, 496(%3) \n\t" +#endif "lxvp 56, 384(%2) \n\t" "lxvp 58, 416(%2) \n\t" "lxvp 60, 448(%2) \n\t" @@ -111,22 +171,73 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "two%=: \n\t" - "stxvp 32, 0(%3) \n\t" - "stxvp 34, 32(%3) \n\t" - "stxvp 36, 64(%3) \n\t" - "stxvp 38, 96(%3) \n\t" - "stxvp 40, 128(%3) \n\t" - "stxvp 42, 160(%3) \n\t" - "stxvp 44, 192(%3) \n\t" - "stxvp 46, 224(%3) \n\t" - "stxvp 48, 256(%3) \n\t" - "stxvp 50, 288(%3) \n\t" - "stxvp 52, 320(%3) \n\t" - "stxvp 54, 352(%3) \n\t" - "stxvp 56, 384(%3) \n\t" - "stxvp 58, 416(%3) \n\t" - "stxvp 60, 448(%3) \n\t" - "stxvp 62, 480(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" + "stxv 48, 256(%3) \n\t" + "stxv 49, 272(%3) \n\t" + "stxv 50, 288(%3) \n\t" + "stxv 51, 304(%3) \n\t" + "stxv 52, 320(%3) \n\t" + "stxv 53, 336(%3) \n\t" + "stxv 54, 352(%3) \n\t" + "stxv 55, 368(%3) \n\t" + "stxv 56, 384(%3) \n\t" + "stxv 57, 400(%3) \n\t" + "stxv 58, 416(%3) \n\t" + "stxv 59, 432(%3) \n\t" + "stxv 60, 448(%3) \n\t" + "stxv 61, 464(%3) \n\t" + "stxv 62, 480(%3) \n\t" + "stxv 63, 496(%3) \n\t" +#else + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" + "stxv 49, 256(%3) \n\t" + "stxv 48, 272(%3) \n\t" + "stxv 51, 288(%3) \n\t" + "stxv 50, 304(%3) \n\t" + "stxv 53, 320(%3) \n\t" + "stxv 52, 336(%3) \n\t" + "stxv 55, 352(%3) \n\t" + "stxv 54, 368(%3) \n\t" + "stxv 57, 384(%3) \n\t" + "stxv 56, 400(%3) \n\t" + "stxv 59, 416(%3) \n\t" + "stxv 58, 432(%3) \n\t" + "stxv 61, 448(%3) \n\t" + "stxv 60, 464(%3) \n\t" + "stxv 63, 480(%3) \n\t" + "stxv 62, 496(%3) \n\t" +#endif "#n=%1 x=%4=%2 y=%0=%3" : diff --git a/kernel/power/cscal_microk_power10.c b/kernel/power/cscal_microk_power10.c index d6a91f079..1f4ea288c 100644 --- a/kernel/power/cscal_microk_power10.c +++ b/kernel/power/cscal_microk_power10.c @@ -95,18 +95,38 @@ static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) "xvaddsp 50, 50, 36 \n\t" "xvaddsp 51, 51, 37 \n\t" - "stxvp 48, 0(%2) \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" +#else + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" +#endif "xvaddsp 52, 52, 38 \n\t" "xvaddsp 53, 53, 39 \n\t" - "stxvp 50, 32(%2) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" +#endif "xvaddsp 54, 54, 56 \n\t" "xvaddsp 55, 55, 57 \n\t" - "stxvp 52, 64(%2) \n\t" - "stxvp 54, 96(%2) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 52, 64(%2) \n\t" + "stxv 53, 80(%2) \n\t" + "stxv 54, 96(%2) \n\t" + "stxv 55, 112(%2) \n\t" +#else + "stxv 53, 64(%2) \n\t" + "stxv 52, 80(%2) \n\t" + "stxv 55, 96(%2) \n\t" + "stxv 54, 112(%2) \n\t" +#endif "addi %2, %2, 128 \n\t" @@ -148,18 +168,39 @@ static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) "xvaddsp 50, 50, 36 \n\t" "xvaddsp 51, 51, 37 \n\t" - "stxvp 48, 0(%2) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" +#else + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" +#endif "xvaddsp 52, 52, 38 \n\t" "xvaddsp 53, 53, 39 \n\t" - "stxvp 50, 32(%2) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" +#endif "xvaddsp 54, 54, 56 \n\t" "xvaddsp 55, 55, 57 \n\t" - "stxvp 52, 64(%2) \n\t" - "stxvp 54, 96(%2) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 52, 64(%2) \n\t" + "stxv 53, 80(%2) \n\t" + "stxv 54, 96(%2) \n\t" + "stxv 55, 112(%2) \n\t" +#else + "stxv 53, 64(%2) \n\t" + "stxv 52, 80(%2) \n\t" + "stxv 55, 96(%2) \n\t" + "stxv 54, 112(%2) \n\t" +#endif "#n=%1 x=%0=%2 alpha=(%3,%4)\n" : diff --git a/kernel/power/daxpy_microk_power10.c b/kernel/power/daxpy_microk_power10.c index bc9199efd..f92ea5dda 100644 --- a/kernel/power/daxpy_microk_power10.c +++ b/kernel/power/daxpy_microk_power10.c @@ -60,14 +60,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) "xvmaddadp 37, 33, %x4 \n\t" "lxvp 32, 0(%2) \n\t" - "stxvp 36, 0(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 36, 0(%3) \n\t" + "stxv 37, 16(%3) \n\t" +#else + "stxv 37, 0(%3) \n\t" + "stxv 36, 16(%3) \n\t" +#endif "xvmaddadp 38, 34, %x4 \n\t" "xvmaddadp 39, 35, %x4 \n\t" "lxvp 34, 32(%2) \n\t" - "stxvp 38, 32(%3) \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 38, 32(%3) \n\t" + "stxv 39, 48(%3) \n\t" +#else + "stxv 39, 32(%3) \n\t" + "stxv 38, 48(%3) \n\t" +#endif "lxvp 36, 128(%3) \n\t" "lxvp 38, 160(%3) \n\t" @@ -76,13 +87,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) "xvmaddadp 45, 41, %x4 \n\t" "lxvp 40, 64(%2) \n\t" - "stxvp 44, 64(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 44, 64(%3) \n\t" + "stxv 45, 80(%3) \n\t" +#else + "stxv 45, 64(%3) \n\t" + "stxv 44, 80(%3) \n\t" +#endif "xvmaddadp 46, 42, %x4 \n\t" "xvmaddadp 47, 43, %x4 \n\t" "lxvp 42, 96(%2) \n\t" - "stxvp 46, 96(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 46, 96(%3) \n\t" + "stxv 47, 112(%3) \n\t" +#else + "stxv 47, 96(%3) \n\t" + "stxv 46, 112(%3) \n\t" +#endif "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" @@ -105,10 +128,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) "xvmaddadp 46, 42, %x4 \n\t" "xvmaddadp 47, 43, %x4 \n\t" - "stxvp 36, 0(%3) \n\t" - "stxvp 38, 32(%3) \n\t" - "stxvp 44, 64(%3) \n\t" - "stxvp 46, 96(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 36, 0(%3) \n\t" + "stxv 37, 16(%3) \n\t" + "stxv 38, 32(%3) \n\t" + "stxv 39, 48(%3) \n\t" + "stxv 44, 64(%3) \n\t" + "stxv 45, 80(%3) \n\t" + "stxv 46, 96(%3) \n\t" + "stxv 47, 112(%3) \n\t" +#else + "stxv 37, 0(%3) \n\t" + "stxv 36, 16(%3) \n\t" + "stxv 39, 32(%3) \n\t" + "stxv 38, 48(%3) \n\t" + "stxv 45, 64(%3) \n\t" + "stxv 44, 80(%3) \n\t" + "stxv 47, 96(%3) \n\t" + "stxv 46, 112(%3) \n\t" +#endif "#n=%1 x=%5=%2 y=%0=%3 alpha=%6 t0=%x4\n" : diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c index 8640efcfd..cfa6fd73d 100644 --- a/kernel/power/daxpy_power10.c +++ b/kernel/power/daxpy_power10.c @@ -68,7 +68,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( n >= 16 ) { - BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; for (i = 0; i < align; i++) { y[i] += da * x[i] ; } diff --git a/kernel/power/dcopy_power10.c b/kernel/power/dcopy_power10.c index 6c5eb4d77..78e636b6a 100644 --- a/kernel/power/dcopy_power10.c +++ b/kernel/power/dcopy_power10.c @@ -87,7 +87,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { if ( n >= 64 ) { - BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; for (i = 0; i < align; i++) { y[i] = x[i] ; } diff --git a/kernel/power/dscal_microk_power10.c b/kernel/power/dscal_microk_power10.c index d0d506f24..a788640b5 100644 --- a/kernel/power/dscal_microk_power10.c +++ b/kernel/power/dscal_microk_power10.c @@ -59,10 +59,25 @@ static void dscal_kernel_8 (long n, double *x, double alpha) "lxvp 36, 192(%2) \n\t" "lxvp 38, 224(%2) \n\t" - "stxvp 40, 0(%2) \n\t" - "stxvp 42, 32(%2) \n\t" - "stxvp 44, 64(%2) \n\t" - "stxvp 46, 96(%2) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 0(%2) \n\t" + "stxv 41, 16(%2) \n\t" + "stxv 42, 32(%2) \n\t" + "stxv 43, 48(%2) \n\t" + "stxv 44, 64(%2) \n\t" + "stxv 45, 80(%2) \n\t" + "stxv 46, 96(%2) \n\t" + "stxv 47, 112(%2) \n\t" +#else + "stxv 41, 0(%2) \n\t" + "stxv 40, 16(%2) \n\t" + "stxv 43, 32(%2) \n\t" + "stxv 42, 48(%2) \n\t" + "stxv 45, 64(%2) \n\t" + "stxv 44, 80(%2) \n\t" + "stxv 47, 96(%2) \n\t" + "stxv 46, 112(%2) \n\t" +#endif "addi %2, %2, 128 \n\t" @@ -81,10 +96,25 @@ static void dscal_kernel_8 (long n, double *x, double alpha) "xvmuldp 46, 38, 48 \n\t" "xvmuldp 47, 39, 48 \n\t" - "stxvp 40, 0(%2) \n\t" - "stxvp 42, 32(%2) \n\t" - "stxvp 44, 64(%2) \n\t" - "stxvp 46, 96(%2) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 0(%2) \n\t" + "stxv 41, 16(%2) \n\t" + "stxv 42, 32(%2) \n\t" + "stxv 43, 48(%2) \n\t" + "stxv 44, 64(%2) \n\t" + "stxv 45, 80(%2) \n\t" + "stxv 46, 96(%2) \n\t" + "stxv 47, 112(%2) \n\t" +#else + "stxv 41, 0(%2) \n\t" + "stxv 40, 16(%2) \n\t" + "stxv 43, 32(%2) \n\t" + "stxv 42, 48(%2) \n\t" + "stxv 45, 64(%2) \n\t" + "stxv 44, 80(%2) \n\t" + "stxv 47, 96(%2) \n\t" + "stxv 46, 112(%2) \n\t" +#endif "#n=%1 alpha=%3 x=%0=%2" : @@ -112,10 +142,14 @@ static void dscal_kernel_8_zero (long n, double *x) ".align 5 \n" "one%=: \n\t" - "stxvp 32, 0(%2) \n\t" - "stxvp 32, 32(%2) \n\t" - "stxvp 32, 64(%2) \n\t" - "stxvp 32, 96(%2) \n\t" + "stxv 32, 0(%2) \n\t" + "stxv 32, 16(%2) \n\t" + "stxv 32, 32(%2) \n\t" + "stxv 32, 48(%2) \n\t" + "stxv 32, 64(%2) \n\t" + "stxv 32, 80(%2) \n\t" + "stxv 32, 96(%2) \n\t" + "stxv 32, 112(%2) \n\t" "addi %2, %2, 128 \n\t" diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 9e6229c6a..192a76eec 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -120,7 +120,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, #if defined(POWER10) if ( n >= 32 ) { - BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; for (i = 0; i < align; i++) { temp = y[i]; y[i] = x[i]; diff --git a/kernel/power/saxpy_microk_power10.c b/kernel/power/saxpy_microk_power10.c index 6ede1dcdd..cf5f45959 100644 --- a/kernel/power/saxpy_microk_power10.c +++ b/kernel/power/saxpy_microk_power10.c @@ -67,13 +67,25 @@ static void saxpy_kernel_64(long n, float *x, float *y, float alpha) "xvmaddasp 37, 33, %x4 \n\t" "lxvp 32, 0(%2) \n\t" - "stxvp 36, 0(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 36, 0(%3) \n\t" + "stxv 37, 16(%3) \n\t" +#else + "stxv 37, 0(%3) \n\t" + "stxv 36, 16(%3) \n\t" +#endif "xvmaddasp 38, 34, %x4 \n\t" "xvmaddasp 39, 35, %x4 \n\t" "lxvp 34, 32(%2) \n\t" - "stxvp 38, 32(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 38, 32(%3) \n\t" + "stxv 39, 48(%3) \n\t" +#else + "stxv 39, 32(%3) \n\t" + "stxv 38, 48(%3) \n\t" +#endif "lxvp 36, 256(%3) \n\t" "lxvp 38, 288(%3) \n\t" @@ -82,13 +94,25 @@ static void saxpy_kernel_64(long n, float *x, float *y, float alpha) "xvmaddasp 45, 41, %x4 \n\t" "lxvp 40, 64(%2) \n\t" - "stxvp 44, 64(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 44, 64(%3) \n\t" + "stxv 45, 80(%3) \n\t" +#else + "stxv 45, 64(%3) \n\t" + "stxv 44, 80(%3) \n\t" +#endif "xvmaddasp 46, 42, %x4 \n\t" "xvmaddasp 47, 43, %x4 \n\t" "lxvp 42, 96(%2) \n\t" - "stxvp 46, 96(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 46, 96(%3) \n\t" + "stxv 47, 112(%3) \n\t" +#else + "stxv 47, 96(%3) \n\t" + "stxv 46, 112(%3) \n\t" +#endif "lxvp 44, 320(%3) \n\t" "lxvp 46, 352(%3) \n\t" @@ -97,13 +121,25 @@ static void saxpy_kernel_64(long n, float *x, float *y, float alpha) "xvmaddasp 57, 49, %x4 \n\t" "lxvp 48, 128(%2) \n\t" - "stxvp 56, 128(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 56, 128(%3) \n\t" + "stxv 57, 144(%3) \n\t" +#else + "stxv 57, 128(%3) \n\t" + "stxv 56, 144(%3) \n\t" +#endif "xvmaddasp 58, 50, %x4 \n\t" "xvmaddasp 59, 51, %x4 \n\t" "lxvp 50, 160(%2) \n\t" - "stxvp 58, 160(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 58, 160(%3) \n\t" + "stxv 59, 176(%3) \n\t" +#else + "stxv 59, 160(%3) \n\t" + "stxv 58, 176(%3) \n\t" +#endif "lxvp 56, 384(%3) \n\t" "lxvp 58, 416(%3) \n\t" @@ -112,13 +148,25 @@ static void saxpy_kernel_64(long n, float *x, float *y, float alpha) "xvmaddasp 61, 53, %x4 \n\t" "lxvp 52, 192(%2) \n\t" - "stxvp 60, 192(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 60, 192(%3) \n\t" + "stxv 61, 208(%3) \n\t" +#else + "stxv 61, 192(%3) \n\t" + "stxv 60, 208(%3) \n\t" +#endif "xvmaddasp 62, 54, %x4 \n\t" "xvmaddasp 63, 55, %x4 \n\t" "lxvp 54, 224(%2) \n\t" - "stxvp 62, 224(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 62, 224(%3) \n\t" + "stxv 63, 240(%3) \n\t" +#else + "stxv 63, 224(%3) \n\t" + "stxv 62, 240(%3) \n\t" +#endif "lxvp 60, 448(%3) \n\t" "lxvp 62, 480(%3) \n\t" @@ -150,14 +198,43 @@ static void saxpy_kernel_64(long n, float *x, float *y, float alpha) "xvmaddasp 61, 53, %x4 \n\t" "xvmaddasp 62, 54, %x4 \n\t" "xvmaddasp 63, 55, %x4 \n\t" - "stxvp 36, 0(%3) \n\t" - "stxvp 38, 32(%3) \n\t" - "stxvp 44, 64(%3) \n\t" - "stxvp 46, 96(%3) \n\t" - "stxvp 56, 128(%3) \n\t" - "stxvp 58, 160(%3) \n\t" - "stxvp 60, 192(%3) \n\t" - "stxvp 62, 224(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 36, 0(%3) \n\t" + "stxv 37, 16(%3) \n\t" + "stxv 38, 32(%3) \n\t" + "stxv 39, 48(%3) \n\t" + "stxv 44, 64(%3) \n\t" + "stxv 45, 80(%3) \n\t" + "stxv 46, 96(%3) \n\t" + "stxv 47, 112(%3) \n\t" + + "stxv 56, 128(%3) \n\t" + "stxv 57, 144(%3) \n\t" + "stxv 58, 160(%3) \n\t" + "stxv 59, 176(%3) \n\t" + "stxv 60, 192(%3) \n\t" + "stxv 61, 208(%3) \n\t" + "stxv 62, 224(%3) \n\t" + "stxv 63, 240(%3) \n\t" +#else + "stxv 37, 0(%3) \n\t" + "stxv 36, 16(%3) \n\t" + "stxv 39, 32(%3) \n\t" + "stxv 38, 48(%3) \n\t" + "stxv 45, 64(%3) \n\t" + "stxv 44, 80(%3) \n\t" + "stxv 47, 96(%3) \n\t" + "stxv 46, 112(%3) \n\t" + + "stxv 57, 128(%3) \n\t" + "stxv 56, 144(%3) \n\t" + "stxv 59, 160(%3) \n\t" + "stxv 58, 176(%3) \n\t" + "stxv 61, 192(%3) \n\t" + "stxv 60, 208(%3) \n\t" + "stxv 63, 224(%3) \n\t" + "stxv 62, 240(%3) \n\t" +#endif "#n=%1 x=%5=%2 y=%0=%3 t0=%x4\n" : diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c index 4a13c1f88..302b2418e 100644 --- a/kernel/power/saxpy_power10.c +++ b/kernel/power/saxpy_power10.c @@ -66,7 +66,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( n >= 64 ) { - BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; for (i = 0; i < align; i++) { y[i] += da * x[i] ; } diff --git a/kernel/power/scopy_power10.c b/kernel/power/scopy_power10.c index 3398ce827..73ca192bf 100644 --- a/kernel/power/scopy_power10.c +++ b/kernel/power/scopy_power10.c @@ -88,7 +88,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( n >= 128 ) { - BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; for (i = 0; i < align; i++) { y[i] = x[i] ; } diff --git a/kernel/power/sscal_microk_power10.c b/kernel/power/sscal_microk_power10.c index a523a1675..76703325c 100644 --- a/kernel/power/sscal_microk_power10.c +++ b/kernel/power/sscal_microk_power10.c @@ -60,10 +60,25 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "lxvp 36, 192(%2) \n\t" "lxvp 38, 224(%2) \n\t" - "stxvp 40, 0(%2) \n\t" - "stxvp 42, 32(%2) \n\t" - "stxvp 44, 64(%2) \n\t" - "stxvp 46, 96(%2) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 0(%2) \n\t" + "stxv 41, 16(%2) \n\t" + "stxv 42, 32(%2) \n\t" + "stxv 43, 48(%2) \n\t" + "stxv 44, 64(%2) \n\t" + "stxv 45, 80(%2) \n\t" + "stxv 46, 96(%2) \n\t" + "stxv 47, 112(%2) \n\t" +#else + "stxv 41, 0(%2) \n\t" + "stxv 40, 16(%2) \n\t" + "stxv 43, 32(%2) \n\t" + "stxv 42, 48(%2) \n\t" + "stxv 45, 64(%2) \n\t" + "stxv 44, 80(%2) \n\t" + "stxv 47, 96(%2) \n\t" + "stxv 46, 112(%2) \n\t" +#endif "addi %2, %2, 128 \n\t" @@ -82,10 +97,25 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 46, 38, 48 \n\t" "xvmulsp 47, 39, 48 \n\t" - "stxvp 40, 0(%2) \n\t" - "stxvp 42, 32(%2) \n\t" - "stxvp 44, 64(%2) \n\t" - "stxvp 46, 96(%2) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 0(%2) \n\t" + "stxv 41, 16(%2) \n\t" + "stxv 42, 32(%2) \n\t" + "stxv 43, 48(%2) \n\t" + "stxv 44, 64(%2) \n\t" + "stxv 45, 80(%2) \n\t" + "stxv 46, 96(%2) \n\t" + "stxv 47, 112(%2) \n\t" +#else + "stxv 41, 0(%2) \n\t" + "stxv 40, 16(%2) \n\t" + "stxv 43, 32(%2) \n\t" + "stxv 42, 48(%2) \n\t" + "stxv 45, 64(%2) \n\t" + "stxv 44, 80(%2) \n\t" + "stxv 47, 96(%2) \n\t" + "stxv 46, 112(%2) \n\t" +#endif "#n=%1 alpha=%3 x=%0=%2" : @@ -113,10 +143,14 @@ static void sscal_kernel_16_zero (long n, float *x) ".align 5 \n" "one%=: \n\t" - "stxvp 32, 0(%2) \n\t" - "stxvp 32, 32(%2) \n\t" - "stxvp 32, 64(%2) \n\t" - "stxvp 32, 96(%2) \n\t" + "stxv 32, 0(%2) \n\t" + "stxv 32, 16(%2) \n\t" + "stxv 32, 32(%2) \n\t" + "stxv 32, 48(%2) \n\t" + "stxv 32, 64(%2) \n\t" + "stxv 32, 80(%2) \n\t" + "stxv 32, 96(%2) \n\t" + "stxv 32, 112(%2) \n\t" "addi %2, %2, 128 \n\t" diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index dd249fd36..3dfb10edd 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -120,7 +120,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, #if defined(POWER10) if ( n >= 64 ) { - BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; for (i = 0; i < align; i++) { temp = y[i]; y[i] = x[i]; diff --git a/kernel/power/swap_microk_power10.c b/kernel/power/swap_microk_power10.c index f9c1fee52..118adee5f 100644 --- a/kernel/power/swap_microk_power10.c +++ b/kernel/power/swap_microk_power10.c @@ -57,25 +57,79 @@ static void sswap_kernel_32 (long n, float *x, float *y) "lxvp 60, 192(%3) \n\t" "lxvp 62, 224(%3) \n\t" - "stxvp 32, 0(%3) \n\t" - "stxvp 34, 32(%3) \n\t" - "stxvp 36, 64(%3) \n\t" - "stxvp 38, 96(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" - "stxvp 40, 128(%3) \n\t" - "stxvp 42, 160(%3) \n\t" - "stxvp 44, 192(%3) \n\t" - "stxvp 46, 224(%3) \n\t" + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 52, 64(%4) \n\t" - "stxvp 54, 96(%4) \n\t" + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 52, 64(%4) \n\t" + "stxv 53, 80(%4) \n\t" + "stxv 54, 96(%4) \n\t" + "stxv 55, 112(%4) \n\t" - "stxvp 56, 128(%4) \n\t" - "stxvp 58, 160(%4) \n\t" - "stxvp 60, 192(%4) \n\t" - "stxvp 62, 224(%4) \n\t" + "stxv 56, 128(%4) \n\t" + "stxv 57, 144(%4) \n\t" + "stxv 58, 160(%4) \n\t" + "stxv 59, 176(%4) \n\t" + "stxv 60, 192(%4) \n\t" + "stxv 61, 208(%4) \n\t" + "stxv 62, 224(%4) \n\t" + "stxv 63, 240(%4) \n\t" +#else + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" + + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 53, 64(%4) \n\t" + "stxv 52, 80(%4) \n\t" + "stxv 55, 96(%4) \n\t" + "stxv 54, 112(%4) \n\t" + + "stxv 57, 128(%4) \n\t" + "stxv 56, 144(%4) \n\t" + "stxv 59, 160(%4) \n\t" + "stxv 58, 176(%4) \n\t" + "stxv 61, 192(%4) \n\t" + "stxv 60, 208(%4) \n\t" + "stxv 63, 224(%4) \n\t" + "stxv 62, 240(%4) \n\t" +#endif "addi %4, %4, 256 \n\t" "addi %3, %3, 256 \n\t" diff --git a/kernel/power/zaxpy_microk_power10.c b/kernel/power/zaxpy_microk_power10.c index b03508b09..366c7ed62 100644 --- a/kernel/power/zaxpy_microk_power10.c +++ b/kernel/power/zaxpy_microk_power10.c @@ -125,10 +125,25 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "xvmaddadp 38, %x10, 33 \n\t" "xvmaddadp 39, %x11, 33 \n\t" - "stxvp 48, 0(%12) \n\t" - "stxvp 50, 32(%12) \n\t" - "stxvp 34, 64(%12) \n\t" - "stxvp 38, 96(%12) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%12) \n\t" + "stxv 49, 16(%12) \n\t" + "stxv 50, 32(%12) \n\t" + "stxv 51, 48(%12) \n\t" + "stxv 34, 64(%12) \n\t" + "stxv 35, 80(%12) \n\t" + "stxv 38, 96(%12) \n\t" + "stxv 39, 112(%12) \n\t" +#else + "stxv 49, 0(%12) \n\t" + "stxv 48, 16(%12) \n\t" + "stxv 51, 32(%12) \n\t" + "stxv 50, 48(%12) \n\t" + "stxv 35, 64(%12) \n\t" + "stxv 34, 80(%12) \n\t" + "stxv 39, 96(%12) \n\t" + "stxv 38, 112(%12) \n\t" +#endif "addi %12, %12, 128 \n\t" @@ -172,10 +187,25 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "xvmaddadp 38, %x10, 33 \n\t" "xvmaddadp 39, %x11, 33 \n\t" - "stxvp 48, 0(%12) \n\t" - "stxvp 50, 32(%12) \n\t" - "stxvp 34, 64(%12) \n\t" - "stxvp 38, 96(%12) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%12) \n\t" + "stxv 49, 16(%12) \n\t" + "stxv 50, 32(%12) \n\t" + "stxv 51, 48(%12) \n\t" + "stxv 34, 64(%12) \n\t" + "stxv 35, 80(%12) \n\t" + "stxv 38, 96(%12) \n\t" + "stxv 39, 112(%12) \n\t" +#else + "stxv 49, 0(%12) \n\t" + "stxv 48, 16(%12) \n\t" + "stxv 51, 32(%12) \n\t" + "stxv 50, 48(%12) \n\t" + "stxv 35, 64(%12) \n\t" + "stxv 34, 80(%12) \n\t" + "stxv 39, 96(%12) \n\t" + "stxv 38, 112(%12) \n\t" +#endif "#n=%1 x=%13=%2 y=%0=%3 alpha=(%15,%16) mvecp=%14=%17 ytmp=%12\n" "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11"