From 94cd946b963e9e077cb4a4c5d93b1ce691e1fe63 Mon Sep 17 00:00:00 2001
From: maamountki
Date: Fri, 4 Jan 2019 17:45:56 +0200
Subject: [PATCH] [ZARCH] fix cgemv_n_4.c

Compute the loop count from the n argument ("srlg %%r0,%0,1") instead of
shifting the stale contents of %%r0 ("srlg %%r0,%%r0,1") in
cgemv_kernel_4x4, cgemv_kernel_4x2 and cgemv_kernel_4x1. The remaining
hunks only realign the inline-assembly whitespace.
---
 kernel/zarch/cgemv_n_4.c | 308 +++++++++++++++++++--------------------
 1 file changed, 154 insertions(+), 154 deletions(-)

diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c
index 4c3253774..c939aea9f 100644
--- a/kernel/zarch/cgemv_n_4.c
+++ b/kernel/zarch/cgemv_n_4.c
@@ -34,107 +34,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 {
     __asm__ volatile (
-    "vlrepg %%v16,0(%5) \n\t"
-    "vlrepg %%v17,8(%5) \n\t"
-    "vlrepg %%v18,16(%5) \n\t"
-    "vlrepg %%v19,24(%5) \n\t"
+    "vlrepg %%v16,0(%5) \n\t"
+    "vlrepg %%v17,8(%5) \n\t"
+    "vlrepg %%v18,16(%5) \n\t"
+    "vlrepg %%v19,24(%5) \n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
-    "vlef %%v20,4(%5),0 \n\t"
-    "vlef %%v20,4(%5),2 \n\t"
-    "vflcsb %%v20,%%v20 \n\t"
-    "vlef %%v20,0(%5),1 \n\t"
-    "vlef %%v20,0(%5),3 \n\t"
+    "vlef %%v20,4(%5),0 \n\t"
+    "vlef %%v20,4(%5),2 \n\t"
+    "vflcsb %%v20,%%v20 \n\t"
+    "vlef %%v20,0(%5),1 \n\t"
+    "vlef %%v20,0(%5),3 \n\t"
 
-    "vlef %%v21,12(%5),0 \n\t"
-    "vlef %%v21,12(%5),2 \n\t"
-    "vflcsb %%v21,%%v21 \n\t"
-    "vlef %%v21,8(%5),1 \n\t"
-    "vlef %%v21,8(%5),3 \n\t"
+    "vlef %%v21,12(%5),0 \n\t"
+    "vlef %%v21,12(%5),2 \n\t"
+    "vflcsb %%v21,%%v21 \n\t"
+    "vlef %%v21,8(%5),1 \n\t"
+    "vlef %%v21,8(%5),3 \n\t"
 
-    "vlef %%v22,20(%5),0 \n\t"
-    "vlef %%v22,20(%5),2 \n\t"
-    "vflcsb %%v22,%%v22 \n\t"
-    "vlef %%v22,16(%5),1 \n\t"
-    "vlef %%v22,16(%5),3 \n\t"
+    "vlef %%v22,20(%5),0 \n\t"
+    "vlef %%v22,20(%5),2 \n\t"
+    "vflcsb %%v22,%%v22 \n\t"
+    "vlef %%v22,16(%5),1 \n\t"
+    "vlef %%v22,16(%5),3 \n\t"
 
-    "vlef %%v23,28(%5),0 \n\t"
-    "vlef %%v23,28(%5),2 \n\t"
-    "vflcsb %%v23,%%v23 \n\t"
-    "vlef %%v23,24(%5),1 \n\t"
-    "vlef %%v23,24(%5),3 \n\t"
+    "vlef %%v23,28(%5),0 \n\t"
+    "vlef %%v23,28(%5),2 \n\t"
+    "vflcsb %%v23,%%v23 \n\t"
+    "vlef %%v23,24(%5),1 \n\t"
+    "vlef %%v23,24(%5),3 \n\t"
 #else
-    "vlef %%v20,0(%5),1 \n\t"
-    "vlef %%v20,0(%5),3 \n\t"
-    "vflcsb %%v20,%%v20 \n\t"
-    "vlef %%v20,4(%5),0 \n\t"
-    "vlef %%v20,4(%5),2 \n\t"
+    "vlef %%v20,0(%5),1 \n\t"
+    "vlef %%v20,0(%5),3 \n\t"
+    "vflcsb %%v20,%%v20 \n\t"
+    "vlef %%v20,4(%5),0 \n\t"
+    "vlef %%v20,4(%5),2 \n\t"
 
-    "vlef %%v21,8(%5),1 \n\t"
-    "vlef %%v21,8(%5),3 \n\t"
-    "vflcsb %%v21,%%v21 \n\t"
-    "vlef %%v21,12(%5),0 \n\t"
-    "vlef %%v21,12(%5),2 \n\t"
+    "vlef %%v21,8(%5),1 \n\t"
+    "vlef %%v21,8(%5),3 \n\t"
+    "vflcsb %%v21,%%v21 \n\t"
+    "vlef %%v21,12(%5),0 \n\t"
+    "vlef %%v21,12(%5),2 \n\t"
 
-    "vlef %%v22,16(%5),1 \n\t"
-    "vlef %%v22,16(%5),3 \n\t"
-    "vflcsb %%v22,%%v22 \n\t"
-    "vlef %%v22,20(%5),0 \n\t"
-    "vlef %%v22,20(%5),2 \n\t"
+    "vlef %%v22,16(%5),1 \n\t"
+    "vlef %%v22,16(%5),3 \n\t"
+    "vflcsb %%v22,%%v22 \n\t"
+    "vlef %%v22,20(%5),0 \n\t"
+    "vlef %%v22,20(%5),2 \n\t"
 
-    "vlef %%v23,24(%5),1 \n\t"
-    "vlef %%v23,24(%5),3 \n\t"
-    "vflcsb %%v23,%%v23 \n\t"
-    "vlef %%v23,28(%5),0 \n\t"
-    "vlef %%v23,28(%5),2 \n\t"
+    "vlef %%v23,24(%5),1 \n\t"
+    "vlef %%v23,24(%5),3 \n\t"
+    "vflcsb %%v23,%%v23 \n\t"
+    "vlef %%v23,28(%5),0 \n\t"
+    "vlef %%v23,28(%5),2 \n\t"
 #endif
-    "xgr %%r1,%%r1 \n\t"
-    "srlg %%r0,%%r0,1 \n\t"
-    "0: \n\t"
-    "pfd 1,1024(%%r1,%1) \n\t"
-    "pfd 1,1024(%%r1,%2) \n\t"
-    "pfd 1,1024(%%r1,%3) \n\t"
-    "pfd 1,1024(%%r1,%4) \n\t"
-    "pfd 2,1024(%%r1,%6) \n\t"
+    "xgr %%r1,%%r1 \n\t"
+    "srlg %%r0,%0,1 \n\t"
+    "0: \n\t"
+    "pfd 1,1024(%%r1,%1) \n\t"
+    "pfd 1,1024(%%r1,%2) \n\t"
+    "pfd 1,1024(%%r1,%3) \n\t"
+    "pfd 1,1024(%%r1,%4) \n\t"
+    "pfd 2,1024(%%r1,%6) \n\t"
 
-    "vlef %%v24,0(%%r1,%1),0 \n\t"
-    "vlef %%v24,0(%%r1,%1),1 \n\t"
-    "vlef %%v24,8(%%r1,%1),2 \n\t"
-    "vlef %%v24,8(%%r1,%1),3 \n\t"
-    "vlef %%v25,4(%%r1,%1),0 \n\t"
-    "vlef %%v25,4(%%r1,%1),1 \n\t"
-    "vlef %%v25,12(%%r1,%1),2 \n\t"
-    "vlef %%v25,12(%%r1,%1),3 \n\t"
-    "vlef %%v26,0(%%r1,%2),0 \n\t"
-    "vlef %%v26,0(%%r1,%2),1 \n\t"
-    "vlef %%v26,8(%%r1,%2),2 \n\t"
-    "vlef %%v26,8(%%r1,%2),3 \n\t"
-    "vlef %%v27,4(%%r1,%2),0 \n\t"
-    "vlef %%v27,4(%%r1,%2),1 \n\t"
-    "vlef %%v27,12(%%r1,%2),2 \n\t"
-    "vlef %%v27,12(%%r1,%2),3 \n\t"
+    "vlef %%v24,0(%%r1,%1),0 \n\t"
+    "vlef %%v24,0(%%r1,%1),1 \n\t"
+    "vlef %%v24,8(%%r1,%1),2 \n\t"
+    "vlef %%v24,8(%%r1,%1),3 \n\t"
+    "vlef %%v25,4(%%r1,%1),0 \n\t"
+    "vlef %%v25,4(%%r1,%1),1 \n\t"
+    "vlef %%v25,12(%%r1,%1),2 \n\t"
+    "vlef %%v25,12(%%r1,%1),3 \n\t"
+    "vlef %%v26,0(%%r1,%2),0 \n\t"
+    "vlef %%v26,0(%%r1,%2),1 \n\t"
+    "vlef %%v26,8(%%r1,%2),2 \n\t"
+    "vlef %%v26,8(%%r1,%2),3 \n\t"
+    "vlef %%v27,4(%%r1,%2),0 \n\t"
+    "vlef %%v27,4(%%r1,%2),1 \n\t"
+    "vlef %%v27,12(%%r1,%2),2 \n\t"
+    "vlef %%v27,12(%%r1,%2),3 \n\t"
 
-    "vl %%v0,0(%%r1,%6) \n\t"
-    "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t"
-    "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t"
-    "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t"
-    "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t"
+    "vl %%v0,0(%%r1,%6) \n\t"
+    "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t"
+    "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t"
+    "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t"
+    "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t"
 
-    "vlef %%v28,0(%%r1,%3),0 \n\t"
-    "vlef %%v28,0(%%r1,%3),1 \n\t"
-    "vlef %%v28,8(%%r1,%3),2 \n\t"
-    "vlef %%v28,8(%%r1,%3),3 \n\t"
-    "vlef %%v29,4(%%r1,%3),0 \n\t"
-    "vlef %%v29,4(%%r1,%3),1 \n\t"
-    "vlef %%v29,12(%%r1,%3),2 \n\t"
-    "vlef %%v29,12(%%r1,%3),3 \n\t"
-    "vlef %%v30,0(%%r1,%4),0 \n\t"
-    "vlef %%v30,0(%%r1,%4),1 \n\t"
-    "vlef %%v30,8(%%r1,%4),2 \n\t"
-    "vlef %%v30,8(%%r1,%4),3 \n\t"
-    "vlef %%v31,4(%%r1,%4),0 \n\t"
-    "vlef %%v31,4(%%r1,%4),1 \n\t"
-    "vlef %%v31,12(%%r1,%4),2 \n\t"
-    "vlef %%v31,12(%%r1,%4),3 \n\t"
+    "vlef %%v28,0(%%r1,%3),0 \n\t"
+    "vlef %%v28,0(%%r1,%3),1 \n\t"
+    "vlef %%v28,8(%%r1,%3),2 \n\t"
+    "vlef %%v28,8(%%r1,%3),3 \n\t"
+    "vlef %%v29,4(%%r1,%3),0 \n\t"
+    "vlef %%v29,4(%%r1,%3),1 \n\t"
+    "vlef %%v29,12(%%r1,%3),2 \n\t"
+    "vlef %%v29,12(%%r1,%3),3 \n\t"
+    "vlef %%v30,0(%%r1,%4),0 \n\t"
+    "vlef %%v30,0(%%r1,%4),1 \n\t"
+    "vlef %%v30,8(%%r1,%4),2 \n\t"
+    "vlef %%v30,8(%%r1,%4),3 \n\t"
+    "vlef %%v31,4(%%r1,%4),0 \n\t"
+    "vlef %%v31,4(%%r1,%4),1 \n\t"
+    "vlef %%v31,12(%%r1,%4),2 \n\t"
+    "vlef %%v31,12(%%r1,%4),3 \n\t"
 
     "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t"
     "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t"
@@ -153,56 +153,56 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 {
     __asm__ volatile (
-    "vlrepg %%v16,0(%3) \n\t"
-    "vlrepg %%v17,8(%3) \n\t"
+    "vlrepg %%v16,0(%3) \n\t"
+    "vlrepg %%v17,8(%3) \n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
-    "vlef %%v18,4(%3),0 \n\t"
-    "vlef %%v18,4(%3),2 \n\t"
-    "vflcsb %%v18,%%v18 \n\t"
-    "vlef %%v18,0(%3),1 \n\t"
-    "vlef %%v18,0(%3),3 \n\t"
+    "vlef %%v18,4(%3),0 \n\t"
+    "vlef %%v18,4(%3),2 \n\t"
+    "vflcsb %%v18,%%v18 \n\t"
+    "vlef %%v18,0(%3),1 \n\t"
+    "vlef %%v18,0(%3),3 \n\t"
 
-    "vlef %%v19,12(%3),0 \n\t"
-    "vlef %%v19,12(%3),2 \n\t"
-    "vflcsb %%v19,%%v19 \n\t"
-    "vlef %%v19,8(%3),1 \n\t"
-    "vlef %%v19,8(%3),3 \n\t"
+    "vlef %%v19,12(%3),0 \n\t"
+    "vlef %%v19,12(%3),2 \n\t"
+    "vflcsb %%v19,%%v19 \n\t"
+    "vlef %%v19,8(%3),1 \n\t"
+    "vlef %%v19,8(%3),3 \n\t"
 #else
-    "vlef %%v18,0(%3),1 \n\t"
-    "vlef %%v18,0(%3),3 \n\t"
-    "vflcsb %%v18,%%v18 \n\t"
-    "vlef %%v18,4(%3),0 \n\t"
-    "vlef %%v18,4(%3),2 \n\t"
+    "vlef %%v18,0(%3),1 \n\t"
+    "vlef %%v18,0(%3),3 \n\t"
+    "vflcsb %%v18,%%v18 \n\t"
+    "vlef %%v18,4(%3),0 \n\t"
+    "vlef %%v18,4(%3),2 \n\t"
 
-    "vlef %%v19,8(%3),1 \n\t"
-    "vlef %%v19,8(%3),3 \n\t"
-    "vflcsb %%v19,%%v19 \n\t"
-    "vlef %%v19,12(%3),0 \n\t"
-    "vlef %%v19,12(%3),2 \n\t"
+    "vlef %%v19,8(%3),1 \n\t"
+    "vlef %%v19,8(%3),3 \n\t"
+    "vflcsb %%v19,%%v19 \n\t"
+    "vlef %%v19,12(%3),0 \n\t"
+    "vlef %%v19,12(%3),2 \n\t"
 #endif
-    "xgr %%r1,%%r1 \n\t"
-    "srlg %%r0,%%r0,1 \n\t"
-    "0: \n\t"
-    "pfd 1,1024(%%r1,%1) \n\t"
-    "pfd 1,1024(%%r1,%2) \n\t"
-    "pfd 2,1024(%%r1,%4) \n\t"
+    "xgr %%r1,%%r1 \n\t"
+    "srlg %%r0,%0,1 \n\t"
+    "0: \n\t"
+    "pfd 1,1024(%%r1,%1) \n\t"
+    "pfd 1,1024(%%r1,%2) \n\t"
+    "pfd 2,1024(%%r1,%4) \n\t"
 
-    "vlef %%v20,0(%%r1,%1),0 \n\t"
-    "vlef %%v20,0(%%r1,%1),1 \n\t"
-    "vlef %%v20,8(%%r1,%1),2 \n\t"
-    "vlef %%v20,8(%%r1,%1),3 \n\t"
-    "vlef %%v21,4(%%r1,%1),0 \n\t"
-    "vlef %%v21,4(%%r1,%1),1 \n\t"
-    "vlef %%v21,12(%%r1,%1),2 \n\t"
-    "vlef %%v21,12(%%r1,%1),3 \n\t"
-    "vlef %%v22,0(%%r1,%2),0 \n\t"
-    "vlef %%v22,0(%%r1,%2),1 \n\t"
-    "vlef %%v22,8(%%r1,%2),2 \n\t"
-    "vlef %%v22,8(%%r1,%2),3 \n\t"
-    "vlef %%v23,4(%%r1,%2),0 \n\t"
-    "vlef %%v23,4(%%r1,%2),1 \n\t"
-    "vlef %%v23,12(%%r1,%2),2 \n\t"
-    "vlef %%v23,12(%%r1,%2),3 \n\t"
+    "vlef %%v20,0(%%r1,%1),0 \n\t"
+    "vlef %%v20,0(%%r1,%1),1 \n\t"
+    "vlef %%v20,8(%%r1,%1),2 \n\t"
+    "vlef %%v20,8(%%r1,%1),3 \n\t"
+    "vlef %%v21,4(%%r1,%1),0 \n\t"
+    "vlef %%v21,4(%%r1,%1),1 \n\t"
+    "vlef %%v21,12(%%r1,%1),2 \n\t"
+    "vlef %%v21,12(%%r1,%1),3 \n\t"
+    "vlef %%v22,0(%%r1,%2),0 \n\t"
+    "vlef %%v22,0(%%r1,%2),1 \n\t"
+    "vlef %%v22,8(%%r1,%2),2 \n\t"
+    "vlef %%v22,8(%%r1,%2),3 \n\t"
+    "vlef %%v23,4(%%r1,%2),0 \n\t"
+    "vlef %%v23,4(%%r1,%2),1 \n\t"
+    "vlef %%v23,12(%%r1,%2),2 \n\t"
+    "vlef %%v23,12(%%r1,%2),3 \n\t"
 
     "vl %%v0,0(%%r1,%4) \n\t"
     "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t"
@@ -222,34 +222,34 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 {
     __asm__ volatile (
-    "vlrepg %%v16,0(%2) \n\t"
+    "vlrepg %%v16,0(%2) \n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
     "vlef %%v17,4(%2),0 \n\t"
-    "vlef %%v17,4(%2),2 \n\t"
+    "vlef %%v17,4(%2),2 \n\t"
     "vflcsb %%v17,%%v17 \n\t"
     "vlef %%v17,0(%2),1 \n\t"
-    "vlef %%v17,0(%2),3 \n\t"
+    "vlef %%v17,0(%2),3 \n\t"
 #else
     "vlef %%v17,0(%2),1 \n\t"
-    "vlef %%v17,0(%2),3 \n\t"
+    "vlef %%v17,0(%2),3 \n\t"
     "vflcsb %%v17,%%v17 \n\t"
     "vlef %%v17,4(%2),0 \n\t"
-    "vlef %%v17,4(%2),2 \n\t"
+    "vlef %%v17,4(%2),2 \n\t"
 #endif
     "xgr %%r1,%%r1 \n\t"
-    "srlg %%r0,%%r0,1 \n\t"
+    "srlg %%r0,%0,1 \n\t"
     "0: \n\t"
     "pfd 1,1024(%%r1,%1) \n\t"
     "pfd 2,1024(%%r1,%3) \n\t"
 
-    "vlef %%v18,0(%%r1,%1),0 \n\t"
-    "vlef %%v18,0(%%r1,%1),1 \n\t"
-    "vlef %%v18,8(%%r1,%1),2 \n\t"
-    "vlef %%v18,8(%%r1,%1),3 \n\t"
-    "vlef %%v19,4(%%r1,%1),0 \n\t"
-    "vlef %%v19,4(%%r1,%1),1 \n\t"
-    "vlef %%v19,12(%%r1,%1),2 \n\t"
-    "vlef %%v19,12(%%r1,%1),3 \n\t"
+    "vlef %%v18,0(%%r1,%1),0 \n\t"
+    "vlef %%v18,0(%%r1,%1),1 \n\t"
+    "vlef %%v18,8(%%r1,%1),2 \n\t"
+    "vlef %%v18,8(%%r1,%1),3 \n\t"
+    "vlef %%v19,4(%%r1,%1),0 \n\t"
+    "vlef %%v19,4(%%r1,%1),1 \n\t"
+    "vlef %%v19,12(%%r1,%1),2 \n\t"
+    "vlef %%v19,12(%%r1,%1),3 \n\t"
 
     "vl %%v0,0(%%r1,%3) \n\t"
     "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t"
@@ -268,18 +268,18 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al
 {
     __asm__ volatile (
 #if !defined(XCONJ)
-    "vlrepf %%v0,%3 \n\t"
-    "vlef %%v1,%4,0 \n\t"
-    "vlef %%v1,%4,2 \n\t"
+    "vlrepf %%v0,%3 \n\t"
+    "vlef %%v1,%4,0 \n\t"
+    "vlef %%v1,%4,2 \n\t"
     "vflcsb %%v1,%%v1 \n\t"
-    "vlef %%v1,%4,1 \n\t"
+    "vlef %%v1,%4,1 \n\t"
     "vlef %%v1,%4,3 \n\t"
 #else
     "vlef %%v0,%3,1 \n\t"
-    "vlef %%v0,%3,3 \n\t"
+    "vlef %%v0,%3,3 \n\t"
     "vflcsb %%v0,%%v0 \n\t"
     "vlef %%v0,%3,0 \n\t"
-    "vlef %%v0,%3,2 \n\t"
+    "vlef %%v0,%3,2 \n\t"
     "vlrepf %%v1,%4 \n\t"
 #endif
     "xgr %%r1,%%r1 \n\t"
@@ -292,7 +292,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al
     "vl %%v17,16(%%r1,%1) \n\t"
     "vl %%v18,0(%%r1,%2) \n\t"
     "vl %%v19,16(%%r1,%2) \n\t"
-    "verllg %%v20,%%v16,32 \n\t"
+    "verllg %%v20,%%v16,32 \n\t"
     "verllg %%v21,%%v17,32 \n\t"
 
     "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t"
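
The substantive change in all three kernels is the second srlg operand: %0 refers to the first entry of the inline-asm operand list (the length n), while %%r0 names the hardware register itself, which holds no defined value at that point, so the halved loop count was garbage before the fix. A minimal standalone sketch of the corrected pattern follows (s390x only; the halve() wrapper and the lgr copy-out are illustrative additions, not code from the patch):

/* Sketch of the fixed pattern: derive a loop count of n >> 1 from the
 * n operand (%1 here, %0 in the kernels) rather than from whatever
 * happens to sit in r0. Hypothetical helper, s390x only. */
#include <stdio.h>

static long halve(long n)
{
    long r;
    __asm__ volatile(
        "srlg %%r0,%1,1\n\t"   /* r0 = n >> 1, as in the fixed kernels */
        "lgr  %0,%%r0\n\t"     /* copy the count out for inspection */
        : "=r"(r)
        : "r"(n)
        : "r0");
    return r;
}

int main(void)
{
    printf("%ld\n", halve(10)); /* prints 5 */
    return 0;
}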