From 2619ad7ea5505a0d24a20d26be2ab173237a11d8 Mon Sep 17 00:00:00 2001 From: the mslm Date: Wed, 17 Jan 2018 18:05:44 -0800 Subject: [PATCH] Blas1 mikrokernels can be inlined by gcc. Refactoring ( symbolic operan names). Some fixes and tunings --- kernel/zarch/dasum.c | 28 +-- kernel/zarch/daxpy.c | 289 +++++------------------------- kernel/zarch/dcopy.c | 113 ++++++------ kernel/zarch/ddot.c | 109 ++++++----- kernel/zarch/drot.c | 260 +++++++++++++-------------- kernel/zarch/dscal.c | 141 ++++++++------- kernel/zarch/dswap.c | 408 ++++++++++++++++-------------------------- kernel/zarch/idamax.c | 95 +++++----- kernel/zarch/idamin.c | 63 +++---- kernel/zarch/izamax.c | 192 ++++++++++---------- kernel/zarch/izamin.c | 234 ++++++++++++------------ kernel/zarch/zasum.c | 106 +++++------ kernel/zarch/zaxpy.c | 139 +++++++------- kernel/zarch/zcopy.c | 92 +++++----- kernel/zarch/zdot.c | 132 +++++++------- kernel/zarch/zrot.c | 259 +++++++++++++-------------- kernel/zarch/zscal.c | 347 +++++++++++++++++------------------ kernel/zarch/zswap.c | 270 ++++++++++++++++++++-------- 18 files changed, 1535 insertions(+), 1742 deletions(-) diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 46dfcc08a..7a42a0863 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -39,19 +39,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT asum ; + FLOAT asum ; __asm__ ( - "pfd 1, 0(%3) \n\t" - "sllg %%r0,%2,3 \n\t" - "agr %%r0,%3 \n\t" + "pfd 1, 0(%[ptr_x]) \n\t" + "sllg %%r0,%[n],3 \n\t" + "agr %%r0,%[ptr_x] \n\t" "vzero %%v0 \n\t" "vzero %%v1 \n\t" "vzero %%v2 \n\t" "vzero %%v3 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%1 ) \n\t" - "vlm %%v24,%%v31, 0(%1 ) \n\t" + "1: \n\t" + "pfd 1, 256(%[ptr_temp] ) \n\t" + "vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t" "vflpdb %%v24, %%v24 \n\t" "vflpdb %%v25, %%v25 \n\t" @@ -71,7 +71,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v2,%%v2,%%v30 \n\t" "vfadb %%v3,%%v3,%%v31 \n\t" - "vlm %%v24,%%v31, 128(%1) \n\t" + "vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t" "vflpdb %%v24, %%v24 \n\t" "vflpdb %%v25, %%v25 \n\t" @@ -81,7 +81,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vflpdb %%v29, %%v29 \n\t" "vflpdb %%v30, %%v30 \n\t" "vflpdb %%v31, %%v31 \n\t" - "la %1,256(%1) \n\t" + "la %[ptr_temp],256(%[ptr_temp]) \n\t" "vfadb %%v0,%%v0,%%v24 \n\t" "vfadb %%v1,%%v1,%%v25 \n\t" "vfadb %%v2,%%v2,%%v26 \n\t" @@ -91,16 +91,16 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v2,%%v2,%%v30 \n\t" "vfadb %%v3,%%v3,%%v31 \n\t" - "clgrjl %1,%%r0,1b \n\t" + "clgrjl %[ptr_temp],%%r0,1b \n\t" "vfadb %%v24,%%v0,%%v1 \n\t" "vfadb %%v25,%%v2,%%v3 \n\t" "vfadb %%v0,%%v25,%%v24 \n\t" "vrepg %%v1,%%v0,1 \n\t" "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 \n\t" - : "=f"(asum),"+&a"(x) - : "r"(n), "1"(x) - : "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" + "ldr %[asum],%%f0 \n\t" + : [asum] "=f"(asum),[ptr_temp] "+&a"(x) + : [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x) + : "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" ); return asum; diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index b6cbdfee8..16f82a587 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -27,15 
+27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define Z13_D 1 + #define PREFETCH_INS 1 #if defined(Z13_A) #include -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) { BLASLONG i = 0; - __vector double v_a = {*alpha,*alpha}; + __vector double v_a = {alpha,alpha}; __vector double * v_y=(__vector double *)y; __vector double * v_x=(__vector double *)x; @@ -60,256 +60,53 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) } } -#elif defined(Z13_B) -static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - - __asm__ volatile( -#if defined(PREFETCH_INS) - "pfd 1, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" -#endif - "vlrepg %%v0 , 0(%3) \n\t" - "srlg %3,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "vlr %%v1,%%v0 \n\t" - ".align 16 \n\t" - "1: \n\t" -#if defined(PREFETCH_INS) - "pfd 1, 256(%%r1,%1) \n\t" - "pfd 2, 256(%%r1,%2) \n\t" -#endif - - "vl %%v24, 0(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%1) \n\t" - "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" - "vst %%v16, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - - "vl %%v24,( 0+64)(%%r1,%2) \n\t" - "vl %%v16,( 0+64)(%%r1,%1) \n\t" - "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" - "vst %%v16,( 0+64)(%%r1,%2) \n\t" - "vl %%v25, (16+64)(%%r1,%2) \n\t" - "vl %%v17, (16+64)(%%r1,%1) \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" - "vst %%v17, (16+64)(%%r1,%2) \n\t" - "vl %%v26, (32+64)(%%r1,%2) \n\t" - "vl %%v18, (32+64)(%%r1,%1) \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" - 
"vst %%v18, (32+64)(%%r1,%2) \n\t" - "vl %%v27, (48+64)(%%r1,%2) \n\t" - "vl %%v19, (48+64)(%%r1,%1) \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" - "vst %%v19, (48+64)(%%r1,%2) \n\t" - - "vl %%v24,( 0+128)(%%r1,%2) \n\t" - "vl %%v16,( 0+128)(%%r1,%1) \n\t" - "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" - "vst %%v16,( 0+128)(%%r1,%2) \n\t" - "vl %%v25, (16+128)(%%r1,%2) \n\t" - "vl %%v17, (16+128)(%%r1,%1) \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" - "vst %%v17, (16+128)(%%r1,%2) \n\t" - "vl %%v26, (32+128)(%%r1,%2) \n\t" - "vl %%v18, (32+128)(%%r1,%1) \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" - "vst %%v18, (32+128)(%%r1,%2) \n\t" - "vl %%v27, (48+128)(%%r1,%2) \n\t" - "vl %%v19, (48+128)(%%r1,%1) \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" - "vst %%v19, (48+128)(%%r1,%2) \n\t" - - "vl %%v24,( 0+192)(%%r1,%2) \n\t" - "vl %%v16,( 0+192)(%%r1,%1) \n\t" - "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" - "vst %%v16,( 0+192)(%%r1,%2) \n\t" - "vl %%v25, (16+192)(%%r1,%2) \n\t" - "vl %%v17, (16+192)(%%r1,%1) \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t" - "vst %%v17, (16+192)(%%r1,%2) \n\t" - "vl %%v26, (32+192)(%%r1,%2) \n\t" - "vl %%v18, (32+192)(%%r1,%1) \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" - "vst %%v18, (32+192)(%%r1,%2) \n\t" - "vl %%v27, (48+192)(%%r1,%2) \n\t" - "vl %%v19, (48+192)(%%r1,%1) \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t" - "vst %%v19, (48+192)(%%r1,%2) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %3,1b" - : - :"r"(n),"a"(x),"a"(y),"a"(alpha) - :"cc", "memory", "r1" ,"v0" ,"v16","v17","v18","v19", "v24","v25","v26","v27" - ); -} - -#elif defined(Z13_C) -static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - __asm__ volatile( -#if defined(PREFETCH_INS) - "pfd 1, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" -#endif - "vlrepg %%v0 , 0(%3) \n\t" - "srlg %3,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "vlr %%v1,%%v0 \n\t" - ".align 16 \n\t" - "1: \n\t" -#if defined(PREFETCH_INS) - "pfd 1, 256(%%r1,%1) \n\t" - "pfd 2, 256(%%r1,%2) 
\n\t" -#endif - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - - "vl %%v24, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%2) \n\t" - "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" - "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" - "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t" - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - - "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t" - "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t" - "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t" - "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t" - - "vst %%v24, 64(%%r1,%2) \n\t" - "vst %%v25, 80(%%r1,%2) \n\t" - "vst %%v26, 96(%%r1,%2) \n\t" - "vst %%v27, 112(%%r1,%2) \n\t" - - "vl %%v16, (0+128)(%%r1,%1) \n\t" - "vl %%v17, (16+128)(%%r1,%1) \n\t" - "vl %%v18, (32+128)(%%r1,%1) \n\t" - "vl %%v19, (48+128)(%%r1,%1) \n\t" - - "vl %%v24, (0+128)(%%r1,%2) \n\t" - "vl %%v25, (16+128)(%%r1,%2) \n\t" - "vl %%v26, (32+128)(%%r1,%2) \n\t" - "vl %%v27, (48+128)(%%r1,%2) \n\t" - - "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" - "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" - "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t" - "vst %%v16, (0+128)(%%r1,%2) \n\t" - "vst %%v17, (16+128)(%%r1,%2) \n\t" - "vst %%v18, (32+128)(%%r1,%2) \n\t" - "vst %%v19, (48+128)(%%r1,%2) \n\t" - - "vl %%v24, (64+128)(%%r1,%1) \n\t" - "vl %%v25, (80+128)(%%r1,%1) \n\t" - "vl %%v26, (96+128)(%%r1,%1) \n\t" - "vl %%v27, (112+128)(%%r1,%1) \n\t" - - "vl %%v16, (64+128)(%%r1,%2) \n\t" - "vl %%v17, (80+128)(%%r1,%2) \n\t" - "vl %%v18, (96+128)(%%r1,%2) \n\t" - "vl %%v19, (112+128)(%%r1,%2) \n\t" 
- - "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t" - "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t" - "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t" - "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t" - - "vst %%v24, (64+128)(%%r1,%2) \n\t" - "vst %%v25, (80+128)(%%r1,%2) \n\t" - "vst %%v26, (96+128)(%%r1,%2) \n\t" - "vst %%v27, (112+128)(%%r1,%2) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %3,1b" - : - :"r"(n),"a"(x),"a"(y),"a"(alpha) - :"cc", "memory", "r1" ,"v0","v1","v16","v17","v18","v19", "v24","v25","v26","v27" - ); -} - - -#elif defined(Z13_D) -static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +#else +static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) { __asm__ volatile( #if defined(PREFETCH_INS) - "pfd 1, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" -#endif - "vlrepg %%v0 , 0(%3) \n\t" - "srlg %3,%0,5 \n\t" - "vlr %%v1,%%v0 \n\t" + "pfd 1, 0(%[x_tmp]) \n\t" + "pfd 2, 0(%[y_tmp]) \n\t" +#endif + "lgdr %%r0,%[alpha] \n\t" + "vlvgp %%v0,%%r0,%%r0 \n\t" + "srlg %%r0,%[n],5 \n\t" + "vlr %%v1,%%v0 \n\t" ".align 16 \n\t" "1: \n\t" #if defined(PREFETCH_INS) - "pfd 1, 256(%1) \n\t" - "pfd 2, 256(%2) \n\t" + "pfd 1, 256(%[x_tmp]) \n\t" + "pfd 2, 256(%[y_tmp]) \n\t" #endif - "vlm %%v16,%%v23, 0(%1) \n\t" - "vlm %%v24, %%v31, 0(%2) \n\t" - "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" - "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" - "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t" - "vfmadb %%v20,%%v0,%%v20,%%v28 \n\t" - "vfmadb %%v21,%%v1,%%v21,%%v29 \n\t" - "vfmadb %%v22,%%v0,%%v22,%%v30 \n\t" - "vfmadb %%v23,%%v1,%%v23,%%v31 \n\t" - "vstm %%v16,%%v23, 0(%2) \n\t" - "vlm %%v24,%%v31, 128(%1) \n\t" - "vlm %%v16,%%v23, 128(%2) \n\t" - "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t" - "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t" - "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t" - "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t" - "vfmadb %%v28,%%v0,%%v28,%%v20 \n\t" - "vfmadb %%v29,%%v1,%%v29,%%v21 \n\t" - "vfmadb %%v30,%%v0,%%v30,%%v22 \n\t" - "vfmadb %%v31,%%v1,%%v31,%%v23 \n\t" 
- "la %1,256(%1) \n\t" - "vstm %%v24, %%v31, 128(%2) \n\t" - "la %2,256(%2) \n\t" - "brctg %3,1b" - : - :"r"(n),"a"(x),"a"(y),"a"(alpha) - :"cc", "memory", "v0","v1","v16","v17","v18","v19","v20","v21", + "vlm %%v16,%%v23, 0(%[x_tmp]) \n\t" + "vlm %%v24, %%v31, 0(%[y_tmp]) \n\t" + "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t" + "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t" + "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t" + "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t" + "vfmadb %%v20,%%v0,%%v20,%%v28 \n\t" + "vfmadb %%v21,%%v1,%%v21,%%v29 \n\t" + "vfmadb %%v22,%%v0,%%v22,%%v30 \n\t" + "vfmadb %%v23,%%v1,%%v23,%%v31 \n\t" + "vstm %%v16,%%v23, 0(%[y_tmp]) \n\t" + "vlm %%v24,%%v31, 128(%[x_tmp]) \n\t" + "vlm %%v16,%%v23, 128(%[y_tmp]) \n\t" + "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t" + "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t" + "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t" + "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t" + "vfmadb %%v28,%%v0,%%v28,%%v20 \n\t" + "vfmadb %%v29,%%v1,%%v29,%%v21 \n\t" + "vfmadb %%v30,%%v0,%%v30,%%v22 \n\t" + "vfmadb %%v31,%%v1,%%v31,%%v23 \n\t" + "la %[x_tmp],256(%[x_tmp]) \n\t" + "vstm %%v24, %%v31, 128(%[y_tmp]) \n\t" + "la %[y_tmp],256(%[y_tmp]) \n\t" + "brctg %%r0,1b" + : [mem_y] "+m" (*(double (*)[n])y), [x_tmp] "+&a"(x), [y_tmp] "+&a"(y) + : [mem_x] "m" (*(const double (*)[n])x), [n] "r"(n), [alpha] "f"(alpha) + :"cc", "r0", "v0","v1","v16","v17","v18","v19","v20","v21", "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); @@ -334,7 +131,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS BLASLONG n1 = n & -32; if ( n1 ) - daxpy_kernel_32(n1, x, y , &da ); + daxpy_kernel_32(n1, x, y , da ); i = n1; while(i < n) diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index 4a76e1e34..968f63e1b 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -30,83 +30,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(Z13mvc) -static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { +static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__ volatile( - "pfd 1, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" - "srlg %%r0,%0,5 \n\t" + "pfd 1, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],5 \n\t" ".align 16 \n\t" "1: \n\t" - "mvc 0(256,%2),0(%1) \n\t" - "la %1,256(%1) \n\t" - "la %2,256(%2) \n\t" - "brctg %%r0,1b" - : - : "r"(n), "a"(x), "a"(y) - : "cc", "memory","r0" + "mvc 0(256,%[ptr_y]),0(%[ptr_x]) \n\t" + "la %[ptr_x],256(%[ptr_x]) \n\t" + "la %[ptr_y],256(%[ptr_y]) \n\t" + "brctg %[n_tmp],1b" + : [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n) + : [mem_x] "m" (*(const double (*)[n])x), + [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc" ); return; } #else -static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { +static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__ volatile( - "pfd 1, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" + "pfd 1, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],5 \n\t" + "xgr %%r1,%%r1 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%1) \n\t" - "pfd 2, 256(%%r1,%2) \n\t" + "1: \n\t" + "pfd 1, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vst %%v24, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" + "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" + "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" + "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" + "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" + "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" + "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" + "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 64(%%r1,%1) \n\t" - "vst %%v24, 64(%%r1,%2) \n\t" - "vl %%v25, 80(%%r1,%1) 
\n\t" - "vst %%v25, 80(%%r1,%2) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vst %%v26, 96(%%r1,%2) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vst %%v27, 112(%%r1,%2) \n\t" + "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" + "vst %%v24, 64(%%r1,%[ptr_y]) \n\t" + "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" + "vst %%v25, 80(%%r1,%[ptr_y]) \n\t" + "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" + "vst %%v26, 96(%%r1,%[ptr_y]) \n\t" + "vl %%v27, 112(%%r1,%[ptr_x]) \n\t" + "vst %%v27, 112(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 192(%%r1,%1) \n\t" - "vst %%v24, 192(%%r1,%2) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vst %%v25, 208(%%r1,%2) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vst %%v26, 224(%%r1,%2) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vst %%v27, 240(%%r1,%2) \n\t" - "la %%r1,256(%%r1) \n\t" - "brctg %%r0,1b" - : - : "r"(n), "a"(x), "a"(y) - : "cc", "memory","r0","r1", "v24","v25","v26","v27" + "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" + "vst %%v24, 192(%%r1,%[ptr_y]) \n\t" + "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" + "vst %%v25, 208(%%r1,%[ptr_y]) \n\t" + "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" + "vst %%v26, 224(%%r1,%[ptr_y]) \n\t" + "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" + "vst %%v27, 240(%%r1,%[ptr_y]) \n\t" + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n) + : [mem_x] "m" (*(const double (*)[n])x), [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc", "r1", "v24","v25","v26","v27" ); return; diff --git a/kernel/zarch/ddot.c 
b/kernel/zarch/ddot.c index f55746671..c70cbd00d 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -30,65 +30,67 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(Z13) -static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y) +static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { FLOAT dot; __asm__ volatile( - "pfd 1, 0(%2) \n\t" - "pfd 1, 0(%3) \n\t" + "pfd 1, 0(%[ptr_x_tmp]) \n\t" + "pfd 1, 0(%[ptr_y_tmp]) \n\t" "vzero %%v24 \n\t" "vzero %%v25 \n\t" "vzero %%v26 \n\t" "vzero %%v27 \n\t" - "srlg %1,%1,4 \n\t" + "srlg %[n_tmp],%[n_tmp],4 \n\t" "xgr %%r1,%%r1 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%2) \n\t" - "pfd 1, 256(%%r1,%3) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" + "1: \n\t" + "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" + "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 0(%%r1,%3) \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vl %%v29, 16(%%r1,%3) \n\t" - "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" + "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" + "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" - "vl %%v30, 32(%%r1,%3) \n\t" - "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" - "vl %%v31, 48(%%r1,%3) \n\t" - "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" + "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" + "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" + "vl %%v16, 64(%%r1 ,%[ptr_x_tmp]) \n\t" + "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v19, 112(%%r1,%[ptr_x_tmp]) \n\t" - "vl 
%%v28, 64(%%r1,%3) \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" + "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" + "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" + "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" + "vl %%v31, 112(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" - "la %%r1,128(%%r1) \n\t" - "brctg %1,1b \n\t" - "vfadb %%v24,%%v25,%%v24 \n\t" - "vfadb %%v24,%%v26,%%v24 \n\t" - "vfadb %%v24,%%v27,%%v24 \n\t" - "vrepg %%v1,%%v24,1 \n\t" - "vfadb %%v1,%%v24,%%v1 \n\t" - "ldr %0, %%f1 \n\t" - : "=f"(dot) ,"+&r"(n) - : "a"(x),"a"(y) - :"cc" , "r1","v16", "v17","v18","v19","v20","v21","v22","v23", + "la %%r1,128(%%r1) \n\t" + "brctg %[n_tmp],1b \n\t" + "vfadb %%v24,%%v25,%%v24 \n\t" + "vfadb %%v24,%%v26,%%v24 \n\t" + "vfadb %%v24,%%v27,%%v24 \n\t" + "vrepg %%v1,%%v24,1 \n\t" + "vfadb %%v1,%%v24,%%v1 \n\t" + "ldr %[dot], %%f1 \n\t" + : [dot] "=f"(dot) ,[n_tmp] "+&r"(n) + : [mem_x] "m"( *(const double (*)[n])x), + [mem_y] "m"( *(const double (*)[n])y), + [ptr_x_tmp]"a"(x), [ptr_y_tmp] "a"(y) + :"cc" , "r1","f1","v16", "v17","v18","v19","v20","v21","v22","v23", "v24","v25","v26","v27","v28","v29","v30","v31" ); @@ -99,14 +101,14 @@ static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y) #else -static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y ) +static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y ) { BLASLONG register i = 0; FLOAT dot = 0.0; while(i < n) { - dot += y[i] * x[i] + dot += y[i] * x[i] + y[i+1] * x[i+1] + y[i+2] * x[i+2] + y[i+3] * x[i+3] @@ -114,8 +116,17 @@ static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y ) + y[i+5] * x[i+5] + y[i+6] * x[i+6] + y[i+7] * x[i+7] ; + dot += y[i+8] * x[i+8] + + y[i+9] * 
x[i+9] + + y[i+10] * x[i+10] + + y[i+11] * x[i+11] + + y[i+12] * x[i+12] + + y[i+13] * x[i+13] + + y[i+14] * x[i+14] + + y[i+15] * x[i+15] ; + - i+=8 ; + i+=16 ; } return dot; @@ -138,10 +149,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG n1 = n & -16; - if ( n1 ) - dot = ddot_kernel_8(n1, x, y ); + if ( n1 ){ + dot = ddot_kernel_16(n1, x, y ); + i = n1; + } - i = n1; + while(i < n) { diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index 1097dd0cc..bf29538c7 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -24,44 +24,41 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" - - - -static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) { __asm__ ( - "pfd 2, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" - - "vlrepg %%v0,0(%3) \n\t" - "vlrepg %%v1,0(%4) \n\t" - "srlg %%r0,%0,5 \n\t" + "pfd 2, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "lgdr %%r1,%[cos] \n\t" + "vlvgp %%v0,%%r1,%%r1 \n\t" + "lgdr %%r1,%[sin] \n\t" + "vlvgp %%v1,%%r1,%%r1 \n\t" + "srlg %[n_tmp],%[n_tmp],5 \n\t" "xgr %%r1,%%r1 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%1) \n\t" - "pfd 2, 256(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" + "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" 
+ "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/ "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ @@ -71,35 +68,33 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" + "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 112(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 112(%%r1,%[ptr_y]) \n\t" - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) 
\n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/ "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ @@ -109,35 +104,33 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) 
\n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/ "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ @@ -147,35 +140,33 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" + "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 
144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/ "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ @@ -185,34 +176,29 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - - - "la %%r1,256(%%r1) \n\t" - "brctg %%r0,1b" - : - : "r"(n), "a"(x), "a"(y),"a"(c),"a"(s) - : "cc", "memory","r0","r1" ,"v0","v1","v16", + + "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 
240(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" + + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_x] "+m" (*(double (*)[n])x), + [mem_y] "+m" (*(double (*)[n])y), + [n_tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) + : "cc", "r1" ,"v0","v1","v16", "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return; } - - - int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { BLASLONG i=0; @@ -228,10 +214,8 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -32; if ( n1 > 0 ) { - FLOAT cosa,sina; - cosa=c; - sina=s; - drot_kernel_32(n1, x, y, &cosa, &sina); + + drot_kernel_32(n1, x, y, c, s); i=n1; } @@ -245,7 +229,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } - } else { @@ -267,4 +250,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } - diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index 846b9737c..f57034aef 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -28,78 +28,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(Z13) -static void __attribute__ ((noinline)) dscal_kernel_8( BLASLONG n, FLOAT da , FLOAT *x ) + +static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) { - - __asm__ ("pfd 2, 0(%1) \n\t" - "vrepg %%v0 , %%v0,0 \n\t" - "sllg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%1) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vfmdb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vfmdb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 48(%%r1,%1) \n\t" - "vl %%v24, 64(%%r1,%1) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vfmdb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vfmdb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 112(%%r1,%1) \n\t" - "la %%r1,128(%%r1) \n\t" - "clgrjl %%r1,%%r0,1b \n\t" - : - :"r"(n),"a"(x),"f"(da) - :"cc" , "memory" ,"r0","r1","v0","v24","v25","v26","v27" + + /* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */ + __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" + "lgdr %%r0,%[alpha] \n\t" + "vlvgp %%v0,%%r0,%%r0 \n\t" + "vlr %%v1,%%v0 \n\t" + "sllg %%r0,%[n],3 \n\t" + "agr %%r0,%[x_ptr] \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%[x_ptr]) \n\t" + "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v1 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v1 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v1 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v1 \n\t" + "vstm %%v16,%%v23, 0(%[x_ptr]) \n\t" + "vlm %%v24,%%v31,128(%[x_ptr]) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vfmdb 
%%v25,%%v25,%%v1 \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vfmdb %%v27,%%v27,%%v1 \n\t" + "vfmdb %%v28,%%v28,%%v0 \n\t" + "vfmdb %%v29,%%v29,%%v1 \n\t" + "vfmdb %%v30,%%v30,%%v0 \n\t" + "vfmdb %%v31,%%v31,%%v1 \n\t" + "vstm %%v24,%%v31,128(%[x_ptr]) \n\t" + "la %[x_ptr], 256(%[x_ptr]) \n\t" + "clgrjl %[x_ptr],%%r0,1b \n\t" + : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) + : [n] "r"(n),[alpha] "f"(da) + :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", + "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); -} + } -static void __attribute__ ((noinline)) dscal_kernel_8_zero( BLASLONG n, FLOAT da , FLOAT *x ) +static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) { - __asm__ ("pfd 2, 0(%1) \n\t" - "vzero %%v0 \n\t" - "sllg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%1) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v0, 16(%%r1,%1) \n\t" - "vst %%v0, 32(%%r1,%1) \n\t" - "vst %%v0, 48(%%r1,%1) \n\t" - "vst %%v0, 64(%%r1,%1) \n\t" - "vst %%v0, 80(%%r1,%1) \n\t" - "vst %%v0, 96(%%r1,%1) \n\t" - "vst %%v0, 112(%%r1,%1) \n\t" - "la %%r1,128(%%r1) \n\t" - "clgrjl %%r1,%%r0,1b \n\t" - : - :"r"(n),"a"(x),"f"(da) - :"cc" , "memory" ,"r0","r1","v0" + __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" + "vzero %%v24 \n\t" + "sllg %%r0,%[n],3 \n\t" + "vzero %%v25 \n\t" + "agr %%r0,%[x_ptr] \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%[x_ptr]) \n\t" + "vst %%v24, 0(%[x_ptr]) \n\t" + "vst %%v25, 16(%[x_ptr]) \n\t" + "vst %%v24, 32(%[x_ptr]) \n\t" + "vst %%v25, 48(%[x_ptr]) \n\t" + "vst %%v24, 64(%[x_ptr]) \n\t" + "vst %%v25, 80(%[x_ptr]) \n\t" + "vst %%v24, 96(%[x_ptr]) \n\t" + "vst %%v25, 112(%[x_ptr]) \n\t" + "vst %%v24, 128(%[x_ptr]) \n\t" + "vst %%v25, 144(%[x_ptr]) \n\t" + "vst %%v24, 160(%[x_ptr]) \n\t" + "vst %%v25, 176(%[x_ptr]) \n\t" + "vst %%v24, 192(%[x_ptr]) \n\t" + "vst %%v25, 208(%[x_ptr]) \n\t" + "vst %%v24, 224(%[x_ptr]) \n\t" + "vst %%v25, 240(%[x_ptr]) \n\t" + "la %[x_ptr],256(%[x_ptr]) \n\t" + "clgrjl 
%[x_ptr],%%r0,1b \n\t" + : [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) + : [n] "r"(n) + :"cc" , "r0", "v24" ,"v25" ); } -#endif + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { @@ -114,11 +123,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -32; if ( n1 > 0 ) { - dscal_kernel_8_zero(n1 , da , x); + dscal_kernel_32_zero(n1 , x); j=n1; } @@ -133,10 +142,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -32; if ( n1 > 0 ) { - dscal_kernel_8(n1 , da , x); + dscal_kernel_32(n1 , da , x); j=n1; } while(j < n) diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index b98347870..686585bf0 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -29,299 +29,205 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#define Z13_SWAP_C 1 + #if defined(Z13_SWAP_A) -static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__ volatile( - "pfd 1, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" - "srlg %%r0,%0,5 \n\t" + "pfd 1, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],5 \n\t" "xgr %%r1,%%r1 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%1) \n\t" - "pfd 2, 256(%%r1,%2) \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vst %%v24, 0(%%r1,%2) \n\t" - "vst %%v16, 0(%%r1,%1) \n\t" + "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 64(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vst %%v28, 64(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%1) \n\t" + "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 80(%%r1,%1) \n\t" - "vl 
%%v21, 80(%%r1,%2) \n\t" - "vst %%v29, 80(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%1) \n\t" + "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 96(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vst %%v30, 96(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%1) \n\t" + "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 112(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - "vst %%v31, 112(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%1) \n\t" + "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v16, 128(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v17, 144(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v18, 160(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v19, 176(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" 
- "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v20, 192(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - "la %%r1,256(%%r1) \n\t" - "brctg %%r0,1b" - : - : "r"(n), "a"(x), "a"(y) - : "cc", "memory" ,"r0","r1", "v16","v17","v18","v19","v20","v21","v22","v23" + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_x] "+m" (*(double (*)[n])x), + [mem_y] "+m" (*(double (*)[n])y), + [n_tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" ,"v24","v25","v26","v27","v28","v29","v30","v31" ); return; } -#elif defined(Z13_SWAP_B) +#else + static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__ volatile( - "pfd 2, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" - "srlg %%r0,%0,5 \n\t" + "pfd 2, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],5 \n\t" "xgr %%r1,%%r1 \n\t" ".align 16 \n\t" "1: \n\t" - "pfd 2, 256(%%r1,%1) \n\t" - "pfd 2, 256(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - 
"vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v28, 64(%%r1,%1) \n\t" - "vl %%v29, 80(%%r1,%1) \n\t" - "vl %%v30, 96(%%r1,%1) \n\t" - "vl %%v31, 112(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vst %%v24, 0(%%r1,%2) \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" - "vst %%v28, 64(%%r1,%2) \n\t" - "vst %%v29, 80(%%r1,%2) \n\t" - "vst %%v30, 96(%%r1,%2) \n\t" - "vst %%v31, 112(%%r1,%2)\n\t" - "vst %%v16, 0(%%r1,%1) \n\t" - "vst %%v17, 16(%%r1,%1) \n\t" - "vst %%v18, 32(%%r1,%1) \n\t" - "vst %%v19, 48(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%1) \n\t" - "vst %%v21, 80(%%r1,%1) \n\t" - "vst %%v22, 96(%%r1,%1) \n\t" - "vst %%v23, 112(%%r1,%1)\n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" + "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" + "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" + "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" + "vl %%v3, 
48(%%r1,%[ptr_y]) \n\t" + "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" + "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" + "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" + "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" + "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" + "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" + "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" + "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" + "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" + "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" + "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" + "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" + "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" + "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" + "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" + "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" + "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" + "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" + "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" + "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" + "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" + "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" + "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" + "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" + "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" + "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - - "vst %%v16, 128(%%r1,%1) \n\t" - "vst %%v17, 144(%%r1,%1) \n\t" - "vst %%v18, 160(%%r1,%1) \n\t" - "vst %%v19, 176(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%1) \n\t" - "vst %%v21, 208(%%r1,%1) \n\t" - "vst %%v22, 224(%%r1,%1) \n\t" - "vst %%v23, 240(%%r1,%1) \n\t" + "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 48(%%r1,%[ptr_y]) 
\n\t" + "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "la %%r1,256(%%r1) \n\t" - "brctg %%r0,1b" - : - : "r"(n), "a"(x), "a"(y) - : "cc", "memory","r0","r1", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - return; - -} - -#elif defined(Z13_SWAP_C) -static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 2, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%1) \n\t" - "pfd 2, 256(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 
112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %%r0,1b" - : - : "r"(n), "a"(x), "a"(y) - : "cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_x] "+m" (*(double (*)[n])x), + [mem_y] "+m" (*(double (*)[n])y), + [n_tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc", "memory","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index b8a22aa02..b67091148 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -43,15 +43,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Warning: requirements n>0 and n % 32 == 0 * @param n * @param x pointer to the vector - * @param minf (out) maximum absolute value .( only for output ) + * @param maxf (out) maximum absolute value .( only for output ) * @return index */ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { BLASLONG index; __asm__( - "pfd 1, 0(%4) \n\t" - "sllg %%r0,%3,3 \n\t" - "agr %%r0,%4 \n\t" + "pfd 1, 0(%[ptr_x]) \n\t" + "sllg %%r0,%[n],3 \n\t" + "agr %%r0,%[ptr_x] \n\t" "vleig %%v20,0,0 \n\t" "vleig %%v20,1,1 \n\t" "vleig %%v21,2,0 \n\t" @@ -61,13 +61,13 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vleig %%v23,6,0 \n\t" "vleig %%v23,7,1 \n\t" "vrepig %%v4,8 \n\t" - "vzero %%v5 \n\t" - "vzero %%v18 \n\t" - "vzero %%v19 \n\t" + "vzero %%v5 \n\t" + "vzero %%v18 \n\t" + "vzero %%v19 \n\t" ".align 16 \n\t" "1: \n\t" - "pfd 1, 256(%2 ) \n\t" - "vlm %%v24,%%v31, 0(%2 ) \n\t" + "pfd 1, 256(%[ptr_tmp] ) \n\t" + "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" "vflpdb %%v24, %%v24 \n\t" "vflpdb %%v25, %%v25 \n\t" "vflpdb %%v26, %%v26 \n\t" @@ -89,24 +89,24 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vsel %%v26,%%v23,%%v22,%%v17 \n\t" "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" + "vfchdb %%v28, %%v3,%%v0 \n\t" + "vfchdb %%v29,%%v27, %%v25 \n\t" + "vsel %%v1,%%v2,%%v1,%%v28 \n\t" + "vsel %%v0,%%v3,%%v0,%%v28 \n\t" + "vsel %%v24,%%v26,%%v24,%%v29 \n\t" + "vsel %%v25,%%v27,%%v25,%%v29 \n\t" + "vag %%v1,%%v1,%%v5 
\n\t" + "vag %%v24,%%v24,%%v5 \n\t" + "vag %%v24,%%v24,%%v4 \n\t" + "vfchdb %%v16,%%v25 , %%v0 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vsel %%v29,%%v25,%%v0,%%v16 \n\t" + "vsel %%v28,%%v24,%%v1,%%v16 \n\t" + "vfchdb %%v17, %%v29,%%v18 \n\t" + "vsel %%v19,%%v28,%%v19,%%v17 \n\t" "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vlm %%v24,%%v31,128(%2 ) \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" "vflpdb %%v24, %%v24 \n\t" "vflpdb %%v25, %%v25 \n\t" "vflpdb %%v26, %%v26 \n\t" @@ -134,37 +134,38 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vsel %%v0,%%v3,%%v0,%%v28 \n\t" "vsel %%v24,%%v26,%%v24,%%v29 \n\t" "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %2,256(%2) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" + "vag %%v1,%%v1,%%v5 \n\t" + "vag %%v24,%%v24,%%v5 \n\t" + "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" + "vag %%v24,%%v24,%%v4 \n\t" "vfchdb %%v16,%%v25 , %%v0 \n\t" "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" + "vsel %%v29,%%v25,%%v0,%%v16 \n\t" + "vsel %%v28,%%v24,%%v1,%%v16 \n\t" "vfchdb %%v17, %%v29,%%v18 \n\t" "vsel %%v19,%%v28,%%v19,%%v17 \n\t" "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "clgrjl %2,%%r0,1b \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "clgrjl %[ptr_tmp],%%r0,1b \n\t" - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" + "vrepg %%v26,%%v18,1 \n\t" + "vrepg %%v5,%%v19,1 \n\t" + "wfcdb %%v26,%%v18 \n\t" "jne 2f \n\t" - "vsteg %%v18,%1,0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "j 3f \n\t" - "2: \n\t" + "vsteg %%v18,%[maxf],0 \n\t" + "vmnlg %%v1,%%v5,%%v19 \n\t" + "j 3f \n\t" + + "2: \n\t" "wfchdb %%v16,%%v26,%%v18 \n\t" "vsel %%v1,%%v5,%%v19,%%v16 \n\t" "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "std %%f0,%1 \n\t" - "3: " - : "=r"(index) ,"=m"(*maxf) , "+&a"(x) - : "r"(n), "2"(x) + "std 
%%f0,%[maxf] \n\t" + + "3: \n\t" + "vlgvg %[index],%%v1,0 \n\t" + : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) + : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) : "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 555864af0..8a7ff1659 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -48,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { BLASLONG index; __asm__( - "pfd 1, 0(%4) \n\t" - "sllg %%r0,%3,3 \n\t" - "agr %%r0,%4 \n\t" + "pfd 1, 0(%[ptr_x]) \n\t" + "sllg %%r0,%[n],3 \n\t" + "agr %%r0,%[ptr_x] \n\t" "vleig %%v20,0,0 \n\t" "vleig %%v20,1,1 \n\t" "vleig %%v21,2,0 \n\t" @@ -60,14 +60,14 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vleig %%v23,6,0 \n\t" "vleig %%v23,7,1 \n\t" "vrepig %%v4,8 \n\t" - "vlrepg %%v18,0(%4) \n\t" - "vzero %%v5 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vzero %%v19 \n\t" + "vlrepg %%v18,0(%[ptr_x]) \n\t" + "vzero %%v5 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vzero %%v19 \n\t" ".align 16 \n\t" "1: \n\t" - "pfd 1, 256(%2 ) \n\t" - "vlm %%v24,%%v31, 0(%2 ) \n\t" + "pfd 1, 256(%[ptr_tmp] ) \n\t" + "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" "vflpdb %%v24, %%v24 \n\t" "vflpdb %%v25, %%v25 \n\t" @@ -99,22 +99,22 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vsel %%v24,%%v26,%%v24,%%v29 \n\t" "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" + "vag %%v1,%%v1,%%v5 \n\t" + "vag %%v24,%%v24,%%v5 \n\t" + "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" + "vfchdb %%v16, %%v0,%%v25 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel 
%%v28,%%v24,%%v1,%%v16 \n\t" + "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" + "vfchdb %%v17,%%v18, %%v29 \n\t" + "vsel %%v19,%%v28,%%v19,%%v17 \n\t" "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" - "vlm %%v24,%%v31,128(%2 ) \n\t" + "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" "vflpdb %%v24, %%v24 \n\t" "vflpdb %%v25, %%v25 \n\t" "vflpdb %%v26, %%v26 \n\t" @@ -147,7 +147,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vag %%v1,%%v1,%%v5 \n\t" "vag %%v24,%%v24,%%v5 \n\t" - "la %2,256(%2) \n\t" + "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" "vag %%v24,%%v24,%%v4 \n\t" "vfchdb %%v16, %%v0,%%v25 \n\t" @@ -161,27 +161,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vag %%v5,%%v5,%%v4 \n\t" - "clgrjl %2,%%r0,1b \n\t" + "clgrjl %[ptr_tmp],%%r0,1b \n\t" "vrepg %%v26,%%v18,1 \n\t" "vrepg %%v5,%%v19,1 \n\t" "wfcdb %%v26,%%v18 \n\t" "jne 2f \n\t" - "vsteg %%v18,%1,0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "j 3f \n\t" - "2: \n\t" + "vsteg %%v18,%[minf],0 \n\t" + "vmnlg %%v1,%%v5,%%v19 \n\t" + "j 3f \n\t" + + "2: \n\t" "wfchdb %%v16,%%v18 ,%%v26 \n\t " "vsel %%v1,%%v5,%%v19,%%v16 \n\t" "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "std %%f0,%1 \n\t" - "3:" + "std %%f0,%[minf] \n\t" - : "+r"(index) ,"=m"(*minf),"+&a"(x) - : "r"(n), "2"(x) + "3: \n\t" + "vlgvg %[index],%%v1,0 \n\t" + + : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) + : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 53b040083..216c3414a 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -37,16 +37,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
SUCH DAMAGE. /** * Find maximum index - * Warning: requirements n>0 and n % 8 == 0 + * Warning: requirements n>0 and n % 16 == 0 * @param n * @param x pointer to the vector - * @param minf (out) maximum absolute value .( only for output ) + * @param maxf (out) maximum absolute value .( only for output ) * @return index */ -static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { +static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { BLASLONG index; __asm__( - "pfd 1, 0(%4) \n\t" + "pfd 1, 0(%[ptr_x]) \n\t" "vleig %%v16,0,0 \n\t" "vleig %%v16,1,1 \n\t" "vleig %%v17,2,0 \n\t" @@ -65,32 +65,32 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vleig %%v23,15,1 \n\t" - "sllg %%r0,%3,4 \n\t" - "agr %%r0,%4 \n\t" - "vzero %%v6 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[ptr_x] \n\t" + "vzero %%v6 \n\t" + "vzero %%v7 \n\t" + "vrepig %%v4,16 \n\t" + "vzero %%v5 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%2 ) \n\t" + "1: \n\t" + "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vleg %%v24 , 0( %2),0 \n\t" - "vleg %%v25 , 8( %2),0 \n\t" - "vleg %%v24 , 16( %2),1 \n\t" - "vleg %%v25 , 24( %2),1 \n\t" - "vleg %%v26 , 32( %2),0 \n\t" - "vleg %%v27 , 40( %2),0 \n\t" - "vleg %%v26 , 48( %2),1 \n\t" - "vleg %%v27 , 56( %2),1 \n\t" - "vleg %%v28 , 64( %2),0 \n\t" - "vleg %%v29 , 72( %2),0 \n\t" - "vleg %%v28 , 80( %2),1 \n\t" - "vleg %%v29 , 88( %2),1 \n\t" - "vleg %%v30 , 96( %2),0 \n\t" - "vleg %%v31 ,104( %2),0 \n\t" - "vleg %%v30 ,112( %2),1 \n\t" - "vleg %%v31 ,120( %2),1 \n\t" + "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" + "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" + "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" + "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" + "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" + "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" + "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" + "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" + "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" + "vleg %%v29 , 72(%[ptr_tmp]),0 
\n\t" + "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" + "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" + "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" + "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" + "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" + "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" "vflpdb %%v24, %%v24 \n\t" "vflpdb %%v25, %%v25 \n\t" "vflpdb %%v26, %%v26 \n\t" @@ -100,28 +100,28 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vflpdb %%v30, %%v30 \n\t" "vflpdb %%v31, %%v31 \n\t" - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" + "vfadb %%v0,%%v24,%%v25 \n\t" + "vfadb %%v1,%%v26,%%v27 \n\t" + "vfadb %%v2,%%v28,%%v29 \n\t" + "vfadb %%v3,%%v30,%%v31 \n\t" - "vleg %%v24 , 128( %2),0 \n\t" - "vleg %%v25 , 136( %2),0 \n\t" - "vleg %%v24 , 144( %2),1 \n\t" - "vleg %%v25 , 152( %2),1 \n\t" - "vleg %%v26 , 160( %2),0 \n\t" - "vleg %%v27 , 168( %2),0 \n\t" - "vleg %%v26 , 176( %2),1 \n\t" - "vleg %%v27 , 184( %2),1 \n\t" - "vleg %%v28 , 192( %2),0 \n\t" - "vleg %%v29 , 200( %2),0 \n\t" - "vleg %%v28 , 208( %2),1 \n\t" - "vleg %%v29 , 216( %2),1 \n\t" - "vleg %%v30 , 224( %2),0 \n\t" - "vleg %%v31 , 232( %2),0 \n\t" - "vleg %%v30 , 240( %2),1 \n\t" - "vleg %%v31 , 248( %2),1 \n\t" + "vleg %%v24 , 128(%[ptr_tmp]),0 \n\t" + "vleg %%v25 , 136(%[ptr_tmp]),0 \n\t" + "vleg %%v24 , 144(%[ptr_tmp]),1 \n\t" + "vleg %%v25 , 152(%[ptr_tmp]),1 \n\t" + "vleg %%v26 , 160(%[ptr_tmp]),0 \n\t" + "vleg %%v27 , 168(%[ptr_tmp]),0 \n\t" + "vleg %%v26 , 176(%[ptr_tmp]),1 \n\t" + "vleg %%v27 , 184(%[ptr_tmp]),1 \n\t" + "vleg %%v28 , 192(%[ptr_tmp]),0 \n\t" + "vleg %%v29 , 200(%[ptr_tmp]),0 \n\t" + "vleg %%v28 , 208(%[ptr_tmp]),1 \n\t" + "vleg %%v29 , 216(%[ptr_tmp]),1 \n\t" + "vleg %%v30 , 224(%[ptr_tmp]),0 \n\t" + "vleg %%v31 , 232(%[ptr_tmp]),0 \n\t" + "vleg %%v30 , 240(%[ptr_tmp]),1 \n\t" + "vleg %%v31 , 248(%[ptr_tmp]),1 \n\t" "vflpdb %%v24, %%v24 \n\t" "vflpdb %%v25, %%v25 \n\t" "vflpdb %%v26, %%v26 \n\t" @@ -131,70 +131,70 @@ 
static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vflpdb %%v30, %%v30 \n\t" "vflpdb %%v31, %%v31 \n\t" - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" "vfchdb %%v25,%%v1,%%v0 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" + "vsel %%v29,%%v17,%%v16,%%v25 \n\t" + "vsel %%v31,%%v1,%%v0,%%v25 \n\t" "vfchdb %%v27,%%v3,%%v2 \n\t " - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" + "vsel %%v0,%%v19,%%v18,%%v27 \n\t" + "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - "vfchdb %%v25,%%v26,%%v24 \n\t " - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" + "vfchdb %%v25,%%v26,%%v24 \n\t" + "vsel %%v2,%%v21,%%v20,%%v25 \n\t" + "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - "vfchdb %%v27,%%v30,%%v28 \n\t " - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" + "vfchdb %%v27,%%v30,%%v28 \n\t" + "vsel %%v25,%%v23,%%v22,%%v27 \n\t" + "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - "vfchdb %%v24, %%v1,%%v31 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" + "vfchdb %%v24, %%v1,%%v31 \n\t" + "vsel %%v26,%%v0,%%v29,%%v24 \n\t" + "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - "vfchdb %%v30, %%v27,%%v3 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" + "vfchdb %%v30, %%v27,%%v3 \n\t" + "vsel %%v29,%%v25,%%v2,%%v30 \n\t" + "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - "la %2,256(%2) \n\t" + "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vfchdb %%v0, %%v31,%%v28 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" + "vfchdb %%v0, %%v31,%%v28 \n\t" + "vsel %%v25,%%v29,%%v26,%%v0 \n\t" + "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - "vag %%v25,%%v25,%%v5 \n\t" + "vag %%v25,%%v25,%%v5 \n\t" //cmp with previous - "vfchdb %%v30, 
%%v27,%%v6 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" + "vfchdb %%v30, %%v27,%%v6 \n\t" + "vsel %%v7,%%v25,%%v7,%%v30 \n\t" + "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" - "clgrjl %2,%%r0,1b \n\t" + "clgrjl %[ptr_tmp],%%r0,1b \n\t" //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" + "vrepg %%v26,%%v6,1 \n\t" + "vrepg %%v5,%%v7,1 \n\t" "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%1,0 \n\t" + "jne 2f \n\t" + "vsteg %%v6,%[maxf],0 \n\t" "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "j 3 \n\t" - "2: \n\t" - "wfchdb %%v16,%%v26,%%v6 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "std %%f0,%1 \n\t" - "3: \n\t" - : "=r"(index),"=m"(*maxf),"+&a"(x) - : "r"(n), "2"(x) + "vlgvg %[index],%%v1,0 \n\t" + "j 3f \n\t" + "2: \n\t" + "wfchdb %%v16,%%v26,%%v6 \n\t" + "vsel %%v1,%%v5,%%v7,%%v16 \n\t" + "vsel %%v0,%%v26,%%v6,%%v16 \n\t" + "vlgvg %[index],%%v1,0 \n\t" + "std %%f0,%[maxf] \n\t" + "3: \n\t" + : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) + : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -220,12 +220,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (inc_x == 1) { - BLASLONG n1 = n & -8; + BLASLONG n1 = n & -16; if (n1 > 0) { - max = ziamax_kernel_8_TUNED(n1, x, &maxf); - + max = ziamax_kernel_16_TUNED(n1, x, &maxf); i = n1; + ix = n1 << 1; } while(i < n) diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 77ecf6724..9b2a653a7 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -35,16 +35,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/** * Find minimum index - * Warning: requirements n>0 and n % 8 == 0 + * Warning: requirements n>0 and n % 16 == 0 * @param n * @param x pointer to the vector * @param minf (out) minimum absolute value .( only for output ) * @return minimum index */ -static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { +static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { BLASLONG index ; __asm__( - "pfd 1, 0(%4) \n\t" + "pfd 1, 0(%[ptr_x]) \n\t" "vleig %%v16,0,0 \n\t" "vleig %%v16,1,1 \n\t" "vleig %%v17,2,0 \n\t" @@ -61,143 +61,143 @@ static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "vleig %%v22,13,1 \n\t" "vleig %%v23,14,0 \n\t" "vleig %%v23,15,1 \n\t" - "ld %%f6,0(%4) \n\t" - "lpdbr %%f6,%%f6 \n\t" - "ld %%f7,8(%4) \n\t" - "lpdbr %%f7,%%f7 \n\t" - "adbr %%f6,%%f7 \n\t" - "sllg %%r0,%3,4 \n\t" - "agr %%r0,%4 \n\t" - "vrepg %%v6,%%v6,0 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" + "ld %%f6,0(%[ptr_x]) \n\t" + "lpdbr %%f6,%%f6 \n\t" + "ld %%f7,8(%[ptr_x]) \n\t" + "lpdbr %%f7,%%f7 \n\t" + "adbr %%f6,%%f7 \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[ptr_x] \n\t" + "vrepg %%v6,%%v6,0 \n\t" + "vzero %%v7 \n\t" + "vrepig %%v4,16 \n\t" + "vzero %%v5 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%2 ) \n\t" + "1: \n\t" + "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vleg %%v24 , 0( %2),0 \n\t" - "vleg %%v25 , 8( %2),0 \n\t" - "vleg %%v24 , 16( %2),1 \n\t" - "vleg %%v25 , 24( %2),1 \n\t" - "vleg %%v26 , 32( %2),0 \n\t" - "vleg %%v27 , 40( %2),0 \n\t" - "vleg %%v26 , 48( %2),1 \n\t" - "vleg %%v27 , 56( %2),1 \n\t" - "vleg %%v28 , 64( %2),0 \n\t" - "vleg %%v29 , 72( %2),0 \n\t" - "vleg %%v28 , 80( %2),1 \n\t" - "vleg %%v29 , 88( %2),1 \n\t" - "vleg %%v30 , 96( %2),0 \n\t" - "vleg %%v31 ,104( %2),0 \n\t" - "vleg %%v30 ,112( %2),1 \n\t" - "vleg %%v31 ,120( %2),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 
\n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" + "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" + "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" + "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" + "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" + "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" + "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" + "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" + "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" + "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" + "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" + "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" + "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" + "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" + "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" + "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" + "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" + "vfadb %%v0,%%v24,%%v25 \n\t" + "vfadb %%v1,%%v26,%%v27 \n\t" + "vfadb %%v2,%%v28,%%v29 \n\t" + "vfadb %%v3,%%v30,%%v31 \n\t" - "vleg %%v24 ,128( %2),0 \n\t" - "vleg %%v25 ,136( %2),0 \n\t" - "vleg %%v24 ,144( %2),1 \n\t" - "vleg %%v25 ,152( %2),1 \n\t" - "vleg %%v26 ,160( %2),0 \n\t" - "vleg %%v27 ,168( %2),0 \n\t" - "vleg %%v26 ,176( %2),1 \n\t" - "vleg %%v27 ,184( %2),1 \n\t" - "vleg %%v28 ,192( %2),0 \n\t" - "vleg %%v29 ,200( %2),0 \n\t" - "vleg %%v28 ,208( %2),1 \n\t" - "vleg %%v29 ,216( %2),1 \n\t" - "vleg %%v30 ,224( %2),0 \n\t" - "vleg %%v31 ,232( %2),0 \n\t" - "vleg %%v30 ,240( %2),1 \n\t" - "vleg %%v31 ,248( %2),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" + "vleg %%v24 ,128(%[ptr_tmp]),0 \n\t" + "vleg %%v25 
,136(%[ptr_tmp]),0 \n\t" + "vleg %%v24 ,144(%[ptr_tmp]),1 \n\t" + "vleg %%v25 ,152(%[ptr_tmp]),1 \n\t" + "vleg %%v26 ,160(%[ptr_tmp]),0 \n\t" + "vleg %%v27 ,168(%[ptr_tmp]),0 \n\t" + "vleg %%v26 ,176(%[ptr_tmp]),1 \n\t" + "vleg %%v27 ,184(%[ptr_tmp]),1 \n\t" + "vleg %%v28 ,192(%[ptr_tmp]),0 \n\t" + "vleg %%v29 ,200(%[ptr_tmp]),0 \n\t" + "vleg %%v28 ,208(%[ptr_tmp]),1 \n\t" + "vleg %%v29 ,216(%[ptr_tmp]),1 \n\t" + "vleg %%v30 ,224(%[ptr_tmp]),0 \n\t" + "vleg %%v31 ,232(%[ptr_tmp]),0 \n\t" + "vleg %%v30 ,240(%[ptr_tmp]),1 \n\t" + "vleg %%v31 ,248(%[ptr_tmp]),1 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" - "vfchdb %%v25,%%v0 ,%%v1 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" + "vfchdb %%v25,%%v0 ,%%v1 \n\t" + "vsel %%v29,%%v17,%%v16,%%v25 \n\t" + "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - "vfchdb %%v27,%%v2,%%v3 \n\t" - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" + "vfchdb %%v27,%%v2,%%v3 \n\t" + "vsel %%v0,%%v19,%%v18,%%v27 \n\t" + "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - "vfchdb %%v25,%%v24,%%v26 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" + "vfchdb %%v25,%%v24,%%v26 \n\t" + "vsel %%v2,%%v21,%%v20,%%v25 \n\t" + "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - "vfchdb %%v27,%%v28,%%v30 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" + "vfchdb %%v27,%%v28,%%v30 \n\t" + "vsel %%v25,%%v23,%%v22,%%v27 \n\t" + "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - "vfchdb %%v24,%%v31, %%v1 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - 
"vsel %%v28,%%v1,%%v31,%%v24 \n\t" + "vfchdb %%v24,%%v31, %%v1 \n\t" + "vsel %%v26,%%v0,%%v29,%%v24 \n\t" + "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - "vfchdb %%v30,%%v3, %%v27 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" + "vfchdb %%v30,%%v3, %%v27 \n\t" + "vsel %%v29,%%v25,%%v2,%%v30 \n\t" + "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - "la %2,256(%2) \n\t" + "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vfchdb %%v0,%%v28, %%v31 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" + "vfchdb %%v0,%%v28, %%v31 \n\t" + "vsel %%v25,%%v29,%%v26,%%v0 \n\t" + "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - "vag %%v25,%%v25,%%v5 \n\t" + "vag %%v25,%%v25,%%v5 \n\t" //cmp with previous - "vfchdb %%v30,%%v6 , %%v27 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" + "vfchdb %%v30,%%v6 , %%v27 \n\t" + "vsel %%v7,%%v25,%%v7,%%v30 \n\t" + "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" - "clgrjl %2,%%r0,1b \n\t" + "clgrjl %[ptr_tmp],%%r0,1b \n\t" //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%1,0 \n\t" + "vrepg %%v26,%%v6,1 \n\t" + "vrepg %%v5,%%v7,1 \n\t" + "wfcdb %%v26,%%v6 \n\t" + "jne 2f \n\t" + "vsteg %%v6,%[minf],0 \n\t" "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "j 3f \n\t" + "vlgvg %[index],%%v1,0 \n\t" + "j 3f \n\t" "2: \n\t" - "wfchdb %%v16,%%v6 ,%%v26 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "std %%f0,%1 \n\t" + "wfchdb %%v16,%%v6 ,%%v26 \n\t" + "vsel %%v1,%%v5,%%v7,%%v16 \n\t" + "vsel %%v0,%%v26,%%v6,%%v16 \n\t" + "vlgvg %[index],%%v1,0 \n\t" + "std %%f0,%[minf] \n\t" "3: \n\t" - : "+r"(index) ,"=m"(*minf), "+&a"(x) - : "r"(n), "2"(x) + : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) + : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) : 
"cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -224,12 +224,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (inc_x == 1) { - BLASLONG n1 = n & -8; + BLASLONG n1 = n & -16; if (n1 > 0) { - min = ziamin_kernel_8_TUNED(n1, x, &minf); - + min = ziamin_kernel_16_TUNED(n1, x, &minf); i = n1; + ix = n1 << 1; } else { //assign minf diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index e01f465b1..0fc5c9ecb 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -44,65 +44,65 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__ ( - "pfd 1, 0(%3) \n\t" - "sllg %%r0,%2,4 \n\t" - "agr %%r0,%3 \n\t" - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v22 \n\t" - "vzero %%v23 \n\t" + "pfd 1, 0(%[ptr_x]) \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[ptr_x] \n\t" + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v22 \n\t" + "vzero %%v23 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%1 ) \n\t" - "vlm %%v24,%%v31,0(%1) \n\t" + "1: \n\t" + "pfd 1, 256(%[ptr_tmp] ) \n\t" + "vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v23,%%v23,%%v26 \n\t" - "vfadb %%v22,%%v22,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v23,%%v23,%%v30 \n\t" - "vfadb %%v22,%%v22,%%v31 \n\t" + "vfadb %%v0,%%v0,%%v24 \n\t" + "vfadb %%v1,%%v1,%%v25 \n\t" + "vfadb %%v23,%%v23,%%v26 \n\t" + "vfadb 
%%v22,%%v22,%%v27 \n\t" + "vfadb %%v0,%%v0,%%v28 \n\t" + "vfadb %%v1,%%v1,%%v29 \n\t" + "vfadb %%v23,%%v23,%%v30 \n\t" + "vfadb %%v22,%%v22,%%v31 \n\t" - "vlm %%v24,%%v31, 128(%1 ) \n\t" + "vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "la %1,256(%1) \n\t" - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v23,%%v23,%%v26 \n\t" - "vfadb %%v22,%%v22,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v23,%%v23,%%v30 \n\t" - "vfadb %%v22,%%v22,%%v31 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" + "vfadb %%v0,%%v0,%%v24 \n\t" + "vfadb %%v1,%%v1,%%v25 \n\t" + "vfadb %%v23,%%v23,%%v26 \n\t" + "vfadb %%v22,%%v22,%%v27 \n\t" + "vfadb %%v0,%%v0,%%v28 \n\t" + "vfadb %%v1,%%v1,%%v29 \n\t" + "vfadb %%v23,%%v23,%%v30 \n\t" + "vfadb %%v22,%%v22,%%v31 \n\t" - "clgrjl %1,%%r0,1b \n\t" - "vfadb %%v24,%%v0,%%v1 \n\t" - "vfadb %%v25,%%v23,%%v22 \n\t" - "vfadb %%v0,%%v25,%%v24 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0 ,%%f0" - : "=f"(asum),"+&a"(x) - : "r"(n), "1"(x) + "clgrjl %[ptr_tmp],%%r0,1b \n\t" + "vfadb %%v24,%%v0,%%v1 \n\t" + "vfadb %%v25,%%v23,%%v22 \n\t" + "vfadb %%v0,%%v25,%%v24 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %[asum] ,%%f0" + : [asum] "=f"(asum),[ptr_tmp] "+&a"(x) + : [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x) : "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return asum; diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 
0cd6036b9..6cec47458 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -28,36 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { - __asm__ ("pfd 1, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" - "vlrepg %%v28 , 0(%3) \n\t" - "vlrepg %%v29, 8(%3) \n\t" - "srlg %3,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { + __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" + "pfd 2, 0(%[y_tmp]) \n\t" + "lgdr %%r1,%[alpha_r] \n\t" + "vlvgp %%v28,%%r1,%%r1 \n\t" + "lgdr %%r1,%[alpha_i] \n\t" + "vlvgp %%v29,%%r1,%%r1 \n\t" + "sllg %[tmp],%[tmp],4 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%1) \n\t" - "pfd 2, 256(%%r1,%2) \n\t" - "vleg %%v16 , 0(%%r1,%2),0 \n\t" - "vleg %%v17 , 8(%%r1,%2),0 \n\t" - "vleg %%v16 , 16(%%r1,%2),1 \n\t" - "vleg %%v17 , 24(%%r1,%2),1 \n\t" - - "vleg %%v18 , 32(%%r1,%2),0 \n\t" - "vleg %%v19 , 40(%%r1,%2),0 \n\t" - "vleg %%v18 , 48(%%r1,%2),1 \n\t" - "vleg %%v19 , 56(%%r1,%2),1 \n\t" - - "vleg %%v24 , 0(%%r1,%1),0 \n\t" - "vleg %%v25 , 8(%%r1,%1),0 \n\t" - "vleg %%v24 , 16(%%r1,%1),1 \n\t" - "vleg %%v25 , 24(%%r1,%1),1 \n\t" - - "vleg %%v26 , 32(%%r1,%1),0 \n\t" - "vleg %%v27 , 40(%%r1,%1),0 \n\t" - "vleg %%v26 , 48(%%r1,%1),1 \n\t" - "vleg %%v27 , 56(%%r1,%1),1 \n\t" + "1: \n\t" + "pfd 1, 256(%%r1,%[x_tmp]) \n\t" + "pfd 2, 256(%%r1,%[y_tmp]) \n\t" + "vleg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t" + "vleg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t" + "vleg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t" + "vleg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t" + "vleg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t" + "vleg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t" + "vleg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t" + "vleg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t" + "vleg %%v24 , 0(%%r1,%[x_tmp]),0 \n\t" + "vleg %%v25 , 8(%%r1,%[x_tmp]),0 \n\t" + "vleg %%v24 , 16(%%r1,%[x_tmp]),1 \n\t" + "vleg 
%%v25 , 24(%%r1,%[x_tmp]),1 \n\t" + "vleg %%v26 , 32(%%r1,%[x_tmp]),0 \n\t" + "vleg %%v27 , 40(%%r1,%[x_tmp]),0 \n\t" + "vleg %%v26 , 48(%%r1,%[x_tmp]),1 \n\t" + "vleg %%v27 , 56(%%r1,%[x_tmp]),1 \n\t" #if !defined(CONJ) "vfmsdb %%v16, %%v25, %%v29,%%v16 \n\t" "vfmadb %%v17, %%v24, %%v29, %%v17 \n\t" @@ -79,35 +79,35 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA "vfmsdb %%v19, %%v26, %%v29, %%v19 \n\t" #endif - "vsteg %%v16 , 0(%%r1,%2),0 \n\t" - "vsteg %%v17 , 8(%%r1,%2),0 \n\t" - "vsteg %%v16 , 16(%%r1,%2),1 \n\t" - "vsteg %%v17 , 24(%%r1,%2),1 \n\t" + "vsteg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t" + "vsteg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t" + "vsteg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t" + "vsteg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t" - "vsteg %%v18 , 32(%%r1,%2),0 \n\t" - "vsteg %%v19 , 40(%%r1,%2),0 \n\t" - "vsteg %%v18 , 48(%%r1,%2),1 \n\t" - "vsteg %%v19 , 56(%%r1,%2),1 \n\t" + "vsteg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t" + "vsteg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t" + "vsteg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t" + "vsteg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t" - "vleg %%v20 , 64(%%r1,%2),0 \n\t" - "vleg %%v21 , 72(%%r1,%2),0 \n\t" - "vleg %%v20 , 80(%%r1,%2),1 \n\t" - "vleg %%v21 , 88(%%r1,%2),1 \n\t" + "vleg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t" + "vleg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t" + "vleg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t" + "vleg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t" - "vleg %%v22 , 96(%%r1,%2),0 \n\t" - "vleg %%v23 , 104(%%r1,%2),0 \n\t" - "vleg %%v22 , 112(%%r1,%2),1 \n\t" - "vleg %%v23 , 120(%%r1,%2),1 \n\t" + "vleg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t" + "vleg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t" + "vleg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t" + "vleg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t" - "vleg %%v24 , 64(%%r1,%1),0 \n\t" - "vleg %%v25 , 72(%%r1,%1),0 \n\t" - "vleg %%v24 , 80(%%r1,%1),1 \n\t" - "vleg %%v25 , 88(%%r1,%1),1 \n\t" + "vleg %%v24 , 64(%%r1,%[x_tmp]),0 \n\t" + "vleg %%v25 , 72(%%r1,%[x_tmp]),0 \n\t" + "vleg %%v24 , 80(%%r1,%[x_tmp]),1 \n\t" + "vleg 
%%v25 , 88(%%r1,%[x_tmp]),1 \n\t" - "vleg %%v26 , 96(%%r1,%1),0 \n\t" - "vleg %%v27 , 104(%%r1,%1),0 \n\t" - "vleg %%v26 , 112(%%r1,%1),1 \n\t" - "vleg %%v27 , 120(%%r1,%1),1 \n\t" + "vleg %%v26 , 96(%%r1,%[x_tmp]),0 \n\t" + "vleg %%v27 , 104(%%r1,%[x_tmp]),0 \n\t" + "vleg %%v26 , 112(%%r1,%[x_tmp]),1 \n\t" + "vleg %%v27 , 120(%%r1,%[x_tmp]),1 \n\t" #if !defined(CONJ) "vfmsdb %%v20, %%v25, %%v29,%%v20 \n\t" "vfmadb %%v21, %%v24, %%v29, %%v21 \n\t" @@ -128,21 +128,21 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA "vfmadb %%v22, %%v26, %%v28, %%v22 \n\t" "vfmsdb %%v23, %%v26, %%v29, %%v23 \n\t" #endif - "vsteg %%v20 , 64(%%r1,%2),0 \n\t" - "vsteg %%v21 , 72(%%r1,%2),0 \n\t" - "vsteg %%v20 , 80(%%r1,%2),1 \n\t" - "vsteg %%v21 , 88(%%r1,%2),1 \n\t" + "vsteg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t" + "vsteg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t" + "vsteg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t" + "vsteg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t" - "vsteg %%v22 , 96(%%r1,%2),0 \n\t" - "vsteg %%v23 , 104(%%r1,%2),0 \n\t" - "vsteg %%v22 , 112(%%r1,%2),1 \n\t" - "vsteg %%v23 , 120(%%r1,%2),1 \n\t" + "vsteg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t" + "vsteg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t" + "vsteg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t" + "vsteg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t" - "la %%r1,128(%%r1) \n\t" - "brctg %3,1b" - : - : "r"(n), "a"(x), "a"(y), "a"(alpha) - : "cc", "memory", "r1","v16", + "la %%r1,128(%%r1) \n\t" + "clgrjl %%r1,%[tmp],1b \n\t" + : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) + : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) + : "cc", "r1","v16", "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29" ); @@ -151,7 +151,6 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG 
dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; - FLOAT da[2]; if (n <= 0) return (0); @@ -159,10 +158,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; - if (n1) { - da[0] = da_r; - da[1] = da_i; - zaxpy_kernel_8(n1, x, y, da); + if (n1) { + zaxpy_kernel_8(n1, x, y, da_r,da_i); ix = 2 * n1; } i = n1; diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 9123830ea..b5bf383f7 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void __attribute__ ((noinline)) zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__ volatile( - "pfd 1, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" + "pfd 1, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],4 \n\t" + "xgr %%r1,%%r1 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%1) \n\t" - "pfd 2, 256(%%r1,%2) \n\t" + "1: \n\t" + "pfd 1, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vst %%v24, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" + "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" + "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" + "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" + "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" + "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" + "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" + "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v28, 64(%%r1,%1) \n\t" - "vst %%v28, 64(%%r1,%2) \n\t" - "vl %%v29, 80(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%2) \n\t" - "vl %%v30, 96(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%2) \n\t" - "vl %%v31,112(%%r1,%1) \n\t" - "vst %%v31,112(%%r1,%2) \n\t" + "vl %%v28, 64(%%r1,%[ptr_x]) 
\n\t" + "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" + "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" + "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" + "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vl %%v24,128(%%r1,%1) \n\t" - "vst %%v24,128(%%r1,%2) \n\t" + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v25,144(%%r1,%1) \n\t" - "vst %%v25,144(%%r1,%2) \n\t" + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v26,160(%%r1,%1) \n\t" - "vst %%v26,160(%%r1,%2) \n\t" + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v27,176(%%r1,%1) \n\t" - "vst %%v27,176(%%r1,%2) \n\t" + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - "la %%r1,256(%%r1) \n\t" - "brctg %%r0,1b" - : - : "r"(n), "a"(x), "a"(y) - : "cc", "memory","r0","r1","v24","v25","v26","v27","v28","v29","v30","v31" + "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" + "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" + "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" + "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n) + : [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31" ); - return; + return; } diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index cfa4549ea..61c5d6b98 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -32,75 +32,77 @@ USE OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { __asm__ volatile( - "pfd 1, 0(%2) \n\t" - "pfd 1, 0(%3) \n\t" + "pfd 1, 0(%[ptr_x_tmp]) \n\t" + "pfd 1, 0(%[ptr_y_tmp]) \n\t" "vzero %%v24 \n\t" "vzero %%v25 \n\t" "vzero %%v26 \n\t" "vzero %%v27 \n\t" - "srlg %1,%1,3 \n\t" - "xgr %%r1,%%r1 \n\t" + "srlg %[n_tmp],%[n_tmp],3 \n\t" + "xgr %%r1,%%r1 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%2) \n\t" - "pfd 1, 256(%%r1,%3) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v28, 0(%%r1,%3) \n\t" - "vl %%v29, 16(%%r1,%3) \n\t" - "vl %%v30, 32(%%r1,%3) \n\t" - "vl %%v31, 48(%%r1,%3) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" + "1: \n\t" + "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" + "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" + "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" + "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" + "vfmadb 
%%v25,%%v22,%%v30,%%v25 \n\t" + "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" + "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19,112(%%r1,%2) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" + "vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" + "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" + "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" + "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" + "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" + "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - "la %%r1,128(%%r1) \n\t" - "brctg %1,1b \n\t" - "vfadb %%v24,%%v26,%%v24 \n\t" - "vfadb %%v25,%%v25,%%v27 \n\t" - "vsteg %%v24,0(%4),0 \n\t" - "vsteg %%v24,8(%4),1 \n\t" - "vsteg %%v25,16(%4),1 \n\t" - "vsteg %%v25,24(%4),0 \n\t" - : "=m"(*d) ,"+&r"(n) - : "a"(x), "a"(y), "a"(d) + "la %%r1,128(%%r1) \n\t" + "brctg %[n_tmp],1b \n\t" + "vfadb %%v24,%%v26,%%v24 \n\t" + "vfadb %%v25,%%v25,%%v27 \n\t" 
+ "vsteg %%v24, 0(%[ptr_d]),0 \n\t" + "vsteg %%v24, 8(%[ptr_d]),1 \n\t" + "vsteg %%v25,16(%[ptr_d]),1 \n\t" + "vsteg %%v25,24(%[ptr_d]),0 \n\t" + : [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n) + : [mem_x] "m"( *(const double (*)[2*n])x), + [mem_y] "m"( *(const double (*)[2*n])y), + [ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d) : "cc", "r1","v16", "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); @@ -150,8 +152,8 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i; - BLASLONG ix, iy; + BLASLONG i = 0; + BLASLONG ix=0, iy=0; OPENBLAS_COMPLEX_FLOAT result; FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; @@ -164,13 +166,15 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -8; + BLASLONG j=0; - if (n1) + if (n1){ zdot_kernel_8(n1, x, y, dot); - - i = n1; - BLASLONG j = i * 2; + i = n1; + j = n1 <<1; + } + while (i < n) { diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index dd5574850..380f0140e 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -24,41 +24,41 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" -static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) { __asm__ ( - "pfd 2, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" - - "vlrepg %%v0,0(%3) \n\t" - "vlrepg %%v1,0(%4) \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" + "pfd 2, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "lgdr %%r1,%[cos] \n\t" + "vlvgp %%v0,%%r1,%%r1 \n\t" + "lgdr %%r1,%[sin] \n\t" + "vlvgp %%v1,%%r1,%%r1 \n\t" + "sllg %[tmp],%[tmp],4 \n\t" + "xgr %%r1,%%r1 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%1) \n\t" - "pfd 2, 256(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" + "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ /* 
2nd parts*/ "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ @@ -68,35 +68,33 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" + "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v27,112(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" + "vl %%v19,112(%%r1,%[ptr_y]) \n\t" - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27,112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19,112(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + 
"vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/ "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ @@ -106,35 +104,33 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 
\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/ "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ @@ -144,35 +140,33 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" + "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + "vfmdb 
%%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/ "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ @@ -182,33 +176,29 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - - - "la %%r1,256(%%r1) \n\t" - "brctg %%r0,1b" - : - : "r"(n), "a"(x), "a"(y),"a"(c),"a"(s) - : "cc", "memory","r0","r1" ,"v0","v1","v16", + + "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" + + "la %%r1,256(%%r1) \n\t" + "clgrjl %%r1,%[tmp],1b \n\t" + : [mem_x] "+m" (*(double (*)[2*n])x), + [mem_y] "+m" (*(double (*)[2*n])y), + [tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) + : "cc","r1" ,"v0","v1","v16", "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return; } - - int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { BLASLONG i=0; @@ -224,11 +214,8 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -16; if ( n1 > 0 ) - { - FLOAT 
cosa,sina; - cosa=c; - sina=s; - zrot_kernel_16(n1, x, y, &cosa, &sina); + { + zrot_kernel_16(n1, x, y, c, s); i=n1; ix=2*n1; } @@ -247,7 +234,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } - } else { @@ -273,4 +259,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } - diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index b46f925bb..31f32d8a6 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -29,229 +29,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -static void __attribute__ ((noinline)) zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { +static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { __asm__( - "pfd 1, 0(%1) \n\t" - "sllg %%r0,%0,4 \n\t" - "agr %%r0,%2 \n\t" - "vlrepg %%v24,0(%1) \n\t" - "vlrepg %%v25,8(%1) \n\t" + "pfd 1, 0(%[x_ptr]) \n\t" + "lgdr %%r0,%[alpha_r] \n\t" + "vlvgp %%v24,%%r0,%%r0 \n\t" + "lgdr %%r0,%[alpha_i] \n\t" + "vlvgp %%v25,%%r0,%%r0 \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[x_ptr] \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%2 ) \n\t" - - "vleg %%v20 , 0(%2),0 \n\t" - "vleg %%v21 , 8(%2),0 \n\t" - "vleg %%v20 , 16(%2),1 \n\t" - "vleg %%v21 , 24(%2),1 \n\t" - - "vleg %%v22 , 32(%2),0 \n\t" - "vleg %%v23 , 40(%2),0 \n\t" - "vleg %%v22 , 48(%2),1 \n\t" - "vleg %%v23 , 56(%2),1 \n\t" - - "vfmdb %%v16, %%v21, %%v25 \n\t" - "vfmdb %%v17, %%v20, %%v25 \n\t" - "vfmdb %%v18, %%v23, %%v25 \n\t" - "vfmdb %%v19, %%v22, %%v25 \n\t" + "1: \n\t" + "pfd 2, 256(%[x_ptr] ) \n\t" + "vleg %%v20 , 0(%[x_ptr]),0 \n\t" + "vleg %%v21 , 8(%[x_ptr]),0 \n\t" + "vleg %%v20 , 16(%[x_ptr]),1 \n\t" + "vleg %%v21 , 24(%[x_ptr]),1 \n\t" + "vleg %%v22 , 32(%[x_ptr]),0 \n\t" + "vleg %%v23 , 40(%[x_ptr]),0 \n\t" + "vleg %%v22 , 48(%[x_ptr]),1 \n\t" + "vleg %%v23 , 56(%[x_ptr]),1 \n\t" + "vfmdb %%v16, %%v21, %%v25 \n\t" + "vfmdb %%v17, %%v20, %%v25 \n\t" + "vfmdb %%v18, %%v23, %%v25 \n\t" + "vfmdb %%v19, %%v22, 
%%v25 \n\t" "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" - - "vsteg %%v16 , 0(%2),0 \n\t" - "vsteg %%v17 , 8(%2),0 \n\t" - "vsteg %%v16 , 16(%2),1 \n\t" - "vsteg %%v17 , 24(%2),1 \n\t" - - "vsteg %%v18 , 32(%2),0 \n\t" - "vsteg %%v19 , 40(%2),0 \n\t" - "vsteg %%v18 , 48(%2),1 \n\t" - "vsteg %%v19 , 56(%2),1 \n\t" - - "vleg %%v20 , 64(%2),0 \n\t" - "vleg %%v21 , 72(%2),0 \n\t" - "vleg %%v20 , 80(%2),1 \n\t" - "vleg %%v21 , 88(%2),1 \n\t" - - "vleg %%v22 , 96(%2),0 \n\t" - "vleg %%v23 , 104(%2),0 \n\t" - "vleg %%v22 , 112(%2),1 \n\t" - "vleg %%v23 , 120(%2),1 \n\t" - - "vfmdb %%v16, %%v21, %%v25 \n\t" - "vfmdb %%v17, %%v20, %%v25 \n\t" - "vfmdb %%v18, %%v23, %%v25 \n\t" - "vfmdb %%v19, %%v22, %%v25 \n\t" - + "vsteg %%v16 , 0(%[x_ptr]),0 \n\t" + "vsteg %%v17 , 8(%[x_ptr]),0 \n\t" + "vsteg %%v16 , 16(%[x_ptr]),1 \n\t" + "vsteg %%v17 , 24(%[x_ptr]),1 \n\t" + "vsteg %%v18 , 32(%[x_ptr]),0 \n\t" + "vsteg %%v19 , 40(%[x_ptr]),0 \n\t" + "vsteg %%v18 , 48(%[x_ptr]),1 \n\t" + "vsteg %%v19 , 56(%[x_ptr]),1 \n\t" + "vleg %%v20 , 64(%[x_ptr]),0 \n\t" + "vleg %%v21 , 72(%[x_ptr]),0 \n\t" + "vleg %%v20 , 80(%[x_ptr]),1 \n\t" + "vleg %%v21 , 88(%[x_ptr]),1 \n\t" + "vleg %%v22 , 96(%[x_ptr]),0 \n\t" + "vleg %%v23 , 104(%[x_ptr]),0 \n\t" + "vleg %%v22 , 112(%[x_ptr]),1 \n\t" + "vleg %%v23 , 120(%[x_ptr]),1 \n\t" + "vfmdb %%v16, %%v21, %%v25 \n\t" + "vfmdb %%v17, %%v20, %%v25 \n\t" + "vfmdb %%v18, %%v23, %%v25 \n\t" + "vfmdb %%v19, %%v22, %%v25 \n\t" "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" - - "vsteg %%v16 , 64(%2),0 \n\t" - "vsteg %%v17 , 72(%2),0 \n\t" - "vsteg %%v16 , 80(%2),1 \n\t" - "vsteg %%v17 , 88(%2),1 \n\t" - - "vsteg %%v18 , 96(%2),0 \n\t" - "vsteg %%v19 , 104(%2),0 \n\t" - "vsteg %%v18 , 112(%2),1 \n\t" - "vsteg %%v19 , 120(%2),1 
\n\t" + "vsteg %%v16 , 64(%[x_ptr]),0 \n\t" + "vsteg %%v17 , 72(%[x_ptr]),0 \n\t" + "vsteg %%v16 , 80(%[x_ptr]),1 \n\t" + "vsteg %%v17 , 88(%[x_ptr]),1 \n\t" + "vsteg %%v18 , 96(%[x_ptr]),0 \n\t" + "vsteg %%v19 , 104(%[x_ptr]),0 \n\t" + "vsteg %%v18 , 112(%[x_ptr]),1 \n\t" + "vsteg %%v19 , 120(%[x_ptr]),1 \n\t" - "la %2,128(%2) \n\t" - "clgrjl %2,%%r0,1b \n\t" - : - : "r"(n), "a"(alpha), "a"(x) + "la %[x_ptr],128(%[x_ptr]) \n\t" + "clgrjl %[x_ptr],%%r0,1b \n\t" + : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) + : [n] "r"(n), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) : "cc", "memory","r0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25" ); } -static void __attribute__ ((noinline)) zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { +static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) { - __asm__ ( "pfd 2, 0(%1) \n\t" - "ld %%f0,8(%2) \n\t" - "lcdbr %%f1,%%f0 \n\t" - "lgdr %%r0,%%f1 \n\t" - "vlvgg %%v0,%%r0,1 \n\t" - "vlr %%v16,%%v0 \n\t" - "vlr %%v17 ,%%v0 \n\t" - "vlr %%v1,%%v0 \n\t" - "sllg %%r0,%0,4 \n\t" - "agr %%r0,%1 \n\t" + __asm__ ( "pfd 2, 0(%1) \n\t" + "lgdr %%r0,%[alpha] \n\t" + "vlvgg %%v16,%%r0,0 \n\t" + "lcdbr %[alpha],%[alpha] \n\t" + "lgdr %%r0,%[alpha] \n\t" + "vlvgg %%v16,%%r0,1 \n\t" + "vlr %%v17 ,%%v16 \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[x_ptr] \n\t" ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%1) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vsteg %%v24, 0(%1),1 \n\t" - "vsteg %%v24, 8(%1),0 \n\t" - "vl %%v25, 16(%1) \n\t" - "vfmdb %%v25,%%v25,%%v1 \n\t" - "vsteg %%v25, 16(%1),1 \n\t" - "vsteg %%v25, 24(%1),0 \n\t" - "vl %%v26, 32(%1) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vsteg %%v26, 32(%1),1 \n\t" - "vsteg %%v26, 40(%1),0 \n\t" - "vl %%v27, 48(%1) \n\t" + "1: \n\t" + "vl %%v24, 0(%[x_ptr]) \n\t" + "vfmdb %%v24,%%v24,%%v16 \n\t" + "vsteg %%v24, 0(%[x_ptr]),1 \n\t" + "vsteg %%v24, 8(%[x_ptr]),0 \n\t" + "vl %%v25, 16(%[x_ptr]) \n\t" + "vfmdb %%v25,%%v25,%%v17 \n\t" + "vsteg %%v25, 16(%[x_ptr]),1 
\n\t" + "vsteg %%v25, 24(%[x_ptr]),0 \n\t" + "vl %%v26, 32(%[x_ptr]) \n\t" + "vfmdb %%v26,%%v26,%%v16 \n\t" + "vsteg %%v26, 32(%[x_ptr]),1 \n\t" + "vsteg %%v26, 40(%[x_ptr]),0 \n\t" + "vl %%v27, 48(%[x_ptr]) \n\t" "vfmdb %%v27,%%v27,%%v17 \n\t" - "vsteg %%v27, 40(%1),1 \n\t" - "vsteg %%v27, 48(%1),0 \n\t" - "vl %%v28, 64(%1) \n\t" - "vfmdb %%v28,%%v28,%%v0 \n\t" - "vsteg %%v28, 64(%1),1 \n\t" - "vsteg %%v28, 72(%1),0 \n\t" - "vl %%v29, 80(%1) \n\t" - "vfmdb %%v29,%%v29,%%v1 \n\t" - "vsteg %%v29, 80(%1),1 \n\t" - "vsteg %%v29, 88(%1),0 \n\t" - "vl %%v30, 96(%1) \n\t" - "vfmdb %%v30,%%v30,%%v16 \n\t" - "vsteg %%v27, 96(%1),1 \n\t" - "vsteg %%v27, 104(%1),0 \n\t" - "vl %%v31, 112(%1) \n\t" + "vsteg %%v27, 40(%[x_ptr]),1 \n\t" + "vsteg %%v27, 48(%[x_ptr]),0 \n\t" + "vl %%v28, 64(%[x_ptr]) \n\t" + "vfmdb %%v28,%%v28,%%v16 \n\t" + "vsteg %%v28, 64(%[x_ptr]),1 \n\t" + "vsteg %%v28, 72(%[x_ptr]),0 \n\t" + "vl %%v29, 80(%[x_ptr]) \n\t" + "vfmdb %%v29,%%v29,%%v17 \n\t" + "vsteg %%v29, 80(%[x_ptr]),1 \n\t" + "vsteg %%v29, 88(%[x_ptr]),0 \n\t" + "vl %%v30, 96(%[x_ptr]) \n\t" + "vfmdb %%v30,%%v30,%%v16 \n\t" + "vsteg %%v27, 96(%[x_ptr]),1 \n\t" + "vsteg %%v27, 104(%[x_ptr]),0 \n\t" + "vl %%v31, 112(%[x_ptr]) \n\t" "vfmdb %%v31,%%v31,%%v17 \n\t" - "vsteg %%v31, 112(%1),1 \n\t" - "vsteg %%v31, 120(%1),0 \n\t" - "la %1,128(%1) \n\t" - "clgrjl %1,%%r0,1b \n\t" - : - :"r"(n),"a"(x) ,"a"(alpha) - :"cc", "memory","r0","f0", "f1","v0","v1","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" + "vsteg %%v31, 112(%[x_ptr]),1 \n\t" + "vsteg %%v31, 120(%[x_ptr]),0 \n\t" + "la %[x_ptr],128(%[x_ptr]) \n\t" + "clgrjl %[x_ptr],%%r0,1b \n\t" + : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) + : [n] "r"(n),[alpha] "f"(da_i) + :"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31" ); } -static void __attribute__ ((noinline)) zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { - __asm__ ("pfd 2, 0(%1) \n\t" - "vlrepg %%v18,0(%2) \n\t" - "vlr 
%%v19,%%v18 \n\t" - "vlr %%v16 ,%%v18 \n\t" - "vlr %%v17,%%v18 \n\t" - "sllg %%r0,%0,4 \n\t" - "agr %%r0,%1 \n\t" +static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) { + __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" + "lgdr %%r0,%[alpha] \n\t" + "vlvgp %%v18,%%r0,%%r0 \n\t" + "vlr %%v19,%%v18 \n\t" + "vlr %%v16,%%v18 \n\t" + "vlr %%v17,%%v18 \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[x_ptr] \n\t" ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%1) \n\t" - "vfmdb %%v24,%%v24,%%v18 \n\t" - "vst %%v24, 0(%1) \n\t" - "vl %%v25, 16(%1) \n\t" - "vfmdb %%v25,%%v25,%%v19 \n\t" - "vst %%v25, 16(%1) \n\t" - "vl %%v26, 32(%1) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vst %%v26, 32(%1) \n\t" - "vl %%v27, 48(%1) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vst %%v27, 48(%1) \n\t" - "vl %%v28, 64(%1) \n\t" - "vfmdb %%v28,%%v28,%%v18 \n\t" - "vst %%v28, 64(%1) \n\t" - "vl %%v29, 80(%1) \n\t" - "vfmdb %%v29,%%v29,%%v19 \n\t" - "vst %%v29, 80(%1) \n\t" - "vl %%v30, 96(%1) \n\t" - "vfmdb %%v30,%%v30,%%v16 \n\t" - "vst %%v30, 96(%1) \n\t" - "vl %%v31, 112(%1) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vst %%v31, 112(%1) \n\t" - "la %1,128(%1) \n\t" - "clgrjl %1,%%r0,1b \n\t" - : - :"r"(n),"a"(x) ,"a"(alpha) - :"cc", "memory","r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" + "1: \n\t" + "vl %%v24, 0(%[x_ptr]) \n\t" + "vfmdb %%v24,%%v24,%%v18 \n\t" + "vst %%v24, 0(%[x_ptr]) \n\t" + "vl %%v25, 16(%[x_ptr]) \n\t" + "vfmdb %%v25,%%v25,%%v19 \n\t" + "vst %%v25, 16(%[x_ptr]) \n\t" + "vl %%v26, 32(%[x_ptr]) \n\t" + "vfmdb %%v26,%%v26,%%v16 \n\t" + "vst %%v26, 32(%[x_ptr]) \n\t" + "vl %%v27, 48(%[x_ptr]) \n\t" + "vfmdb %%v27,%%v27,%%v17 \n\t" + "vst %%v27, 48(%[x_ptr]) \n\t" + "vl %%v28, 64(%[x_ptr]) \n\t" + "vfmdb %%v28,%%v28,%%v18 \n\t" + "vst %%v28, 64(%[x_ptr]) \n\t" + "vl %%v29, 80(%[x_ptr]) \n\t" + "vfmdb %%v29,%%v29,%%v19 \n\t" + "vst %%v29, 80(%[x_ptr]) \n\t" + "vl %%v30, 96(%[x_ptr]) \n\t" + "vfmdb %%v30,%%v30,%%v16 \n\t" + "vst %%v30, 96(%[x_ptr]) 
\n\t" + "vl %%v31,112(%[x_ptr]) \n\t" + "vfmdb %%v31,%%v31,%%v17 \n\t" + "vst %%v31,112(%[x_ptr]) \n\t" + "la %[x_ptr],128(%[x_ptr]) \n\t" + "clgrjl %[x_ptr],%%r0,1b \n\t" + : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) + : [n] "r"(n),[alpha] "f"(da_r) + : "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" ); } -static void __attribute__ ((noinline)) zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { +static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { - __asm__ ( "pfd 2, 0(%1) \n\t" + __asm__ ( "pfd 2, 0(%[x_ptr]) \n\t" "vzero %%v24 \n\t" "vzero %%v25 \n\t" "vzero %%v26 \n\t" "vzero %%v27 \n\t" - "sllg %%r0,%0,4 \n\t" - "agr %%r0,%1 \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[x_ptr] \n\t" ".align 16 \n\t" "1: \n\t" - "pfd 2, 256( %1) \n\t" - "vst %%v24, 0( %1) \n\t" - "vst %%v25, 16( %1) \n\t" - "vst %%v26, 32( %1) \n\t" - "vst %%v27, 48( %1) \n\t" - "vst %%v24, 64( %1) \n\t" - "vst %%v25, 80( %1) \n\t" - "vst %%v26, 96( %1) \n\t" - "vst %%v27,112( %1) \n\t" + "pfd 2, 256( %[x_ptr]) \n\t" + "vst %%v24, 0( %[x_ptr]) \n\t" + "vst %%v25, 16( %[x_ptr]) \n\t" + "vst %%v26, 32( %[x_ptr]) \n\t" + "vst %%v27, 48( %[x_ptr]) \n\t" + "vst %%v24, 64( %[x_ptr]) \n\t" + "vst %%v25, 80( %[x_ptr]) \n\t" + "vst %%v26, 96( %[x_ptr]) \n\t" + "vst %%v27,112( %[x_ptr]) \n\t" - "la %1,128(%1) \n\t" - "clgrjl %1,%%r0,1b \n\t" - : - :"r"(n),"a"(x) - :"cc" , "memory" ,"r0","v24","v25","v26","v27" + "la %[x_ptr],128(%[x_ptr]) \n\t" + "clgrjl %[x_ptr],%%r0,1b \n\t" + : [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x) + : [n] "r"(n) + :"cc" ,"r0","v24","v25","v26","v27" ); } -static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline)); -static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) { + +static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) { BLASLONG i; BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, 
t1, t2, t3; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; + FLOAT t0, t1, t2, t3; for (i = 0; i < n; i += 4) { t0 = da_r * x[0] - da_i * x[1]; @@ -280,7 +269,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG i = 0, j = 0; FLOAT temp0; FLOAT temp1; - FLOAT alpha[2]; + if (inc_x != 1) { inc_x <<= 1; @@ -372,10 +361,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { BLASLONG n1 = n & -8; - if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - zscal_kernel_inc_8(n1, alpha, x, inc_x); + if (n1 > 0) { + zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x); j = n1; i = n1 * inc_x; } @@ -401,19 +388,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; if (da_r == 0.0) if (da_i == 0) zscal_kernel_8_zero(n1, x); else - zscal_kernel_8_zero_r(n1, alpha, x); + zscal_kernel_8_zero_r(n1, da_i, x); else if (da_i == 0) - zscal_kernel_8_zero_i(n1, alpha, x); + zscal_kernel_8_zero_i(n1, da_r, x); else - zscal_kernel_8(n1, alpha, x); + zscal_kernel_8(n1, da_r,da_i, x); i = n1 << 1; j = n1; diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 8ed13e98d..adf2d8a1f 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -29,99 +29,211 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void __attribute__ ((noinline)) zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +#if defined(Z13_SWAP_A) +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__ volatile( - "pfd 2, 0(%1) \n\t" - "pfd 2, 0(%2) \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" + "pfd 1, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],4 \n\t" + "xgr %%r1,%%r1 \n\t" ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%1) \n\t" - "pfd 2, 256(%%r1,%2) \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" + "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" + "vst 
%%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - + "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" + + "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" + + "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" + + "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" + + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" + + "vl %%v25, 
144(%%r1,%[ptr_x]) \n\t" + "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" + + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" + + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" + + "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" + + "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" + + "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" + + "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - "la %%r1,256(%%r1) \n\t" - "brctg %%r0,1b" - : - : "r"(n), "a"(x), "a"(y) - :"cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_x] "+m" (*(double (*)[2*n])x), + [mem_y] "+m" (*(double (*)[2*n])y), + [n_tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" + ,"v24","v25","v26","v27","v28","v29","v30","v31" ); return; } +#else + +static void __attribute__ ((noinline)) zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "pfd 2, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],4 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" + + "vl %%v16, 0(%%r1,%[ptr_x]) 
\n\t" + "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" + + + "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" + "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" + "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" + "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" + "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" + "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" + "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" + "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" + "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" + "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" + "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" + "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" + "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" + "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" + "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" + + "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" + "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" + "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" + "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" + "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" + "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" + "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" + "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" + "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" + "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" + "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" + "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" + "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" + "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" + "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" + + "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" + "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst 
%%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" + + + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_x] "+m" (*(double (*)[2*n])x), + [mem_y] "+m" (*(double (*)[2*n])y), + [n_tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc", "memory", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; + +} + +#endif +