From 69a8aa6de2b7647ce322d633627765677fd25b8f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Mar 2018 18:01:51 +0100 Subject: [PATCH 01/23] Fix transposition of expected and computed values in error message --- utest/test_axpy.c | 16 ++++++++-------- utest/test_dotu.c | 17 ++++++++--------- utest/test_dsdot.c | 2 +- utest/test_rot.c | 16 ++++++++-------- utest/test_swap.c | 16 ++++++++-------- 5 files changed, 33 insertions(+), 34 deletions(-) diff --git a/utest/test_axpy.c b/utest/test_axpy.c index 783755333..603043073 100644 --- a/utest/test_axpy.c +++ b/utest/test_axpy.c @@ -48,8 +48,8 @@ CTEST(axpy,daxpy_inc_0) BLASFUNC(daxpy)(&N,&a,x1,&incX,y1,&incY); for(i=0; i -#include CTEST( zdotu,zdotu_n_1) { @@ -50,11 +49,11 @@ CTEST( zdotu,zdotu_n_1) #endif #ifdef OPENBLAS_COMPLEX_STRUCT - ASSERT_DBL_NEAR_TOL(result1.real, result2.real, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(result1.imag, result2.imag, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(result2.real, result1.real, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(result2.imag, result1.imag, DOUBLE_EPS); #else - ASSERT_DBL_NEAR_TOL(creal(result1), creal(result2), DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(cimag(result1), cimag(result2), DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(creal(result2), creal(result1), DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(cimag(result2), cimag(result1), DOUBLE_EPS); #endif } @@ -74,11 +73,11 @@ CTEST(zdotu, zdotu_offset_1) #endif #ifdef OPENBLAS_COMPLEX_STRUCT - ASSERT_DBL_NEAR_TOL(result1.real, result2.real, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(result1.imag, result2.imag, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(result2.real, result1.real, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(result2.imag, result1.imag, DOUBLE_EPS); #else - ASSERT_DBL_NEAR_TOL(creal(result1), creal(result2), DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(cimag(result1), cimag(result2), DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(creal(result2), creal(result1), DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(cimag(result2), cimag(result1), DOUBLE_EPS); #endif } diff --git a/utest/test_dsdot.c 
b/utest/test_dsdot.c index 7d082a372..d58b398a8 100644 --- a/utest/test_dsdot.c +++ b/utest/test_dsdot.c @@ -44,6 +44,6 @@ CTEST(dsdot,dsdot_n_1) double res1=0.0f, res2=-0.00239335360107; res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy); - ASSERT_DBL_NEAR_TOL(res1, res2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(res2, res1, DOUBLE_EPS); } diff --git a/utest/test_rot.c b/utest/test_rot.c index 2a47b8058..cf72ad22d 100644 --- a/utest/test_rot.c +++ b/utest/test_rot.c @@ -48,8 +48,8 @@ CTEST(rot,drot_inc_0) BLASFUNC(drot)(&N,x1,&incX,y1,&incY,&c,&s); for(i=0; i Date: Sun, 4 Mar 2018 17:39:56 +0100 Subject: [PATCH 02/23] Rewrite ROTMG to address cases not covered by the netlib algorithm (#1480) * Rewrite ROTMG based on the new implementation in GONUM based on the algorithm proposed by Tim Hopkins, see issue 1452 for the reference * Correct ROTMG utest for issue1452 and add another from gonum, also correct transposition of expected and observed values in error messages --- interface/rotmg.c | 126 ++++++++++++++++----------------- utest/test_rotmg.c | 77 ++++++++++++++------ utest/utest_main2.c | 166 +++++++++++++++++++++++++++----------------- 3 files changed, 216 insertions(+), 153 deletions(-) diff --git a/interface/rotmg.c b/interface/rotmg.c index acf7399e1..ce3b146c1 100644 --- a/interface/rotmg.c +++ b/interface/rotmg.c @@ -64,6 +64,13 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp; + if (*dd2 == ZERO || dy1 == ZERO) + { + dflag = -TWO; + dparam[0] = dflag; + return; + } + if(*dd1 < ZERO) { dflag = -ONE; @@ -76,6 +83,16 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ *dd2 = ZERO; *dx1 = ZERO; } + else if ((*dd1 == ZERO || *dx1 == ZERO) && *dd2 > ZERO) + { + dflag = ONE; + dh12 = 1; + dh21 = -1; + *dx1 = dy1; + dtemp = *dd1; + *dd1 = *dd2; + *dd2 = dtemp; + } else { dp2 = *dd2 * dy1; @@ -90,6 +107,9 @@ void CNAME(FLOAT *dd1, 
FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ dq1 = dp1 * *dx1; if(ABS(dq1) > ABS(dq2)) { + dflag = ZERO; + dh11 = ONE; + dh22 = ONE; dh21 = - dy1 / *dx1; dh12 = dp2 / dp1; @@ -100,8 +120,19 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ *dd1 = *dd1 / du; *dd2 = *dd2 / du; *dx1 = *dx1 * du; + } else { + dflag = -ONE; + dh11 = ZERO; + dh12 = ZERO; + dh21 = ZERO; + dh22 = ZERO; + + *dd1 = ZERO; + *dd2 = ZERO; + *dx1 = ZERO; } + } else { @@ -120,7 +151,9 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ } else { - dflag = ONE; + dflag = ONE; + dh21 = -ONE; + dh12 = ONE; dh11 = dp1 / dp2; dh22 = *dx1 / dy1; @@ -134,76 +167,33 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ } - if(*dd1 != ZERO) + while ( *dd1 <= RGAMSQ && *dd1 != ZERO) { - if( (*dd1 <= RGAMSQ) || (*dd1 >= GAMSQ) ) - { - if(dflag == ZERO) - { - dh11 = ONE; - dh22 = ONE; - dflag = -ONE; - } - else - { - dh21 = -ONE; - dh12 = ONE; - dflag = -ONE; - } - if( *dd1 <= RGAMSQ ) - { - while (ABS(*dd1) <= RGAMSQ) { - *dd1 = *dd1 * (GAM * GAM); - *dx1 = *dx1 / GAM; - dh11 = dh11 / GAM; - dh12 = dh12 / GAM; - } - } - else - { - while (ABS(*dd1) >= GAMSQ) { - *dd1 = *dd1 / (GAM * GAM); - *dx1 = *dx1 * GAM; - dh11 = dh11 * GAM; - dh12 = dh12 * GAM; - } - } - } + dflag = -ONE; + *dd1 = *dd1 * (GAM * GAM); + *dx1 = *dx1 / GAM; + dh11 = dh11 / GAM; + dh12 = dh12 / GAM; + } + while (ABS(*dd1) > GAMSQ) { + dflag = -ONE; + *dd1 = *dd1 / (GAM * GAM); + *dx1 = *dx1 * GAM; + dh11 = dh11 * GAM; + dh12 = dh12 * GAM; } - if(*dd2 != ZERO) - { - if( (ABS(*dd2) <= RGAMSQ) || (ABS(*dd2) >= GAMSQ) ) - { - if(dflag == ZERO) - { - dh11 = ONE; - dh22 = ONE; - dflag = -ONE; - } - else - { - dh21 = -ONE; - dh12 = ONE; - dflag = -ONE; - } - if( ABS(*dd2) <= RGAMSQ ) - { - while (ABS(*dd2) <= RGAMSQ) { - *dd2 = *dd2 * (GAM * GAM); - dh21 = dh21 / GAM; - dh22 = dh22 / GAM; - } - } - else - { - while (ABS(*dd2) >= GAMSQ) { - *dd2 = *dd2 / (GAM * GAM); - 
dh21 = dh21 * GAM; - dh22 = dh22 * GAM; - } - } - } + while (ABS(*dd2) <= RGAMSQ && *dd2 != ZERO) { + dflag = -ONE; + *dd2 = *dd2 * (GAM * GAM); + dh21 = dh21 / GAM; + dh22 = dh22 / GAM; + } + while (ABS(*dd2) > GAMSQ) { + dflag = -ONE; + *dd2 = *dd2 / (GAM * GAM); + dh21 = dh21 * GAM; + dh22 = dh22 * GAM; } } diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c index 37aba84b3..e5ec78983 100644 --- a/utest/test_rotmg.c +++ b/utest/test_rotmg.c @@ -53,7 +53,7 @@ CTEST (drotmg,rotmg) te_param[i]=tr_param[i]=0.0; } - //reference values as calulated by netlib blas + //reference values as calculated by netlib blas tr_d1= 0.1732048; tr_d2= 0.03840234; @@ -71,13 +71,13 @@ CTEST (drotmg,rotmg) tr_param[4]= 0.0; BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); - ASSERT_DBL_NEAR_TOL(te_d1, tr_d1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_d2, tr_d2, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_x1, tr_x1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_y1, tr_y1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); for(i=0; i<5; i++){ - ASSERT_DBL_NEAR_TOL(te_param[i], tr_param[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); } } @@ -91,7 +91,7 @@ CTEST (drotmg,rotmg_issue1452) double tr_param[5]; int i=0; - // from issue #1452, buggy version returned 0.000244 for param[3] + // from issue #1452 te_d1 = 5.9e-8; te_d2 = 5.960464e-8; te_x1 = 1.0; @@ -100,8 +100,8 @@ CTEST (drotmg,rotmg_issue1452) for(i=0; i<5; i++){ te_param[i]=tr_param[i]=0.0; } - - //reference values as calulated by netlib blas + te_param[3]=1./4096.; + //reference values as calculated by gonum blas with rotmg rewritten to Hopkins' algorithm tr_d1= 0.99995592822897; tr_d2= 0.98981219860583; tr_x1= 0.03662270484346; @@ -110,19 +110,19 @@ CTEST (drotmg,rotmg_issue1452) tr_param[0]= -1.0; tr_param[1]= 0.00000161109346; tr_param[2]= 
-0.00024414062500; - tr_param[3]= 1.0; + tr_param[3]= 0.00024414062500; tr_param[4]= 0.00000162760417; //OpenBLAS BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); - ASSERT_DBL_NEAR_TOL(te_d1, tr_d1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_d2, tr_d2, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_x1, tr_x1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_y1, tr_y1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); for(i=0; i<5; i++){ - ASSERT_DBL_NEAR_TOL(te_param[i], tr_param[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); } } @@ -145,7 +145,7 @@ CTEST(drotmg, rotmg_D1eqD2_X1eqX2) te_param[i]=tr_param[i]=0.0; } - //reference values as calulated by netlib blas + //reference values as calculated by netlib blas tr_d1= 1.0; tr_d2= 1.0; tr_x1= 16.0; @@ -160,12 +160,47 @@ CTEST(drotmg, rotmg_D1eqD2_X1eqX2) //OpenBLAS BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); - ASSERT_DBL_NEAR_TOL(te_d1, tr_d1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_d2, tr_d2, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_x1, tr_x1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_y1, tr_y1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); for(i=0; i<5; i++){ - ASSERT_DBL_NEAR_TOL(te_param[i], tr_param[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +CTEST(drotmg, drotmg_D1_big_D2_big_flag_zero) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]={1.,4096.,-4096.,1.,4096.}; + double tr_param[5]={-1.,4096.,-3584.,1792.,4096.}; + int i=0; + te_d1= tr_d1=1600000000.; + te_d2= tr_d2=800000000.; + te_x1= tr_x1=8.; + te_y1= tr_y1=7.; + + + //reference values as calculated by 
gonum + tr_d1= 68.96627824858757; + tr_d2= 34.483139124293785; + tr_x1= 45312.; + tr_y1= 7.0; + + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); } } diff --git a/utest/utest_main2.c b/utest/utest_main2.c index bcaa43ec0..aa95a5a3f 100644 --- a/utest/utest_main2.c +++ b/utest/utest_main2.c @@ -50,14 +50,15 @@ CTEST(amax, samax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } -CTEST (drotmg,rotmg){ +CTEST (drotmg,rotmg) +{ double te_d1, tr_d1; double te_d2, tr_d2; double te_x1, tr_x1; double te_y1, tr_y1; double te_param[5]; double tr_param[5]; - blasint i=0; + int i=0; // original test case for libGoto bug fixed by feb2014 rewrite te_d1= 0.21149573940783739; te_d2= 0.046892057172954082; @@ -69,7 +70,7 @@ CTEST (drotmg,rotmg){ te_param[i]=tr_param[i]=0.0; } - //reference values as calulated by netlib blas + //reference values as calculated by netlib blas tr_d1= 0.1732048; tr_d2= 0.03840234; @@ -87,26 +88,27 @@ CTEST (drotmg,rotmg){ tr_param[4]= 0.0; BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); - ASSERT_DBL_NEAR_TOL(te_d1, tr_d1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_d2, tr_d2, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_x1, tr_x1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_y1, tr_y1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); for(i=0; i<5; i++){ - ASSERT_DBL_NEAR_TOL(te_param[i], tr_param[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); } } -CTEST (drotmg,rotmg_issue1452){ +CTEST (drotmg,rotmg_issue1452) +{ double te_d1, tr_d1; double te_d2, 
tr_d2; double te_x1, tr_x1; double te_y1, tr_y1; double te_param[5]; double tr_param[5]; - blasint i=0; + int i=0; - // from issue #1452, buggy version returned 0.000244 for param[3] + // from issue #1452 te_d1 = 5.9e-8; te_d2 = 5.960464e-8; te_x1 = 1.0; @@ -115,8 +117,8 @@ CTEST (drotmg,rotmg_issue1452){ for(i=0; i<5; i++){ te_param[i]=tr_param[i]=0.0; } - - //reference values as calulated by netlib blas + te_param[3]=1./4096.; + //reference values as calculated by gonum blas with rotmg rewritten to Hopkins' algorithm tr_d1= 0.99995592822897; tr_d2= 0.98981219860583; tr_x1= 0.03662270484346; @@ -125,31 +127,32 @@ CTEST (drotmg,rotmg_issue1452){ tr_param[0]= -1.0; tr_param[1]= 0.00000161109346; tr_param[2]= -0.00024414062500; - tr_param[3]= 1.0; + tr_param[3]= 0.00024414062500; tr_param[4]= 0.00000162760417; //OpenBLAS BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); - ASSERT_DBL_NEAR_TOL(te_d1, tr_d1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_d2, tr_d2, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_x1, tr_x1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_y1, tr_y1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); for(i=0; i<5; i++){ - ASSERT_DBL_NEAR_TOL(te_param[i], tr_param[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); } } -CTEST(drotmg, rotmg_D1eqD2_X1eqX2){ +CTEST(drotmg, rotmg_D1eqD2_X1eqX2) +{ double te_d1, tr_d1; double te_d2, tr_d2; double te_x1, tr_x1; double te_y1, tr_y1; double te_param[5]; double tr_param[5]; - blasint i=0; + int i=0; te_d1= tr_d1=2.; te_d2= tr_d2=2.; te_x1= tr_x1=8.; @@ -159,7 +162,7 @@ CTEST(drotmg, rotmg_D1eqD2_X1eqX2){ te_param[i]=tr_param[i]=0.0; } - //reference values as calulated by netlib blas + //reference values as calculated by netlib blas tr_d1= 1.0; tr_d2= 1.0; tr_x1= 16.0; @@ -174,13 +177,48 @@ CTEST(drotmg, rotmg_D1eqD2_X1eqX2){ //OpenBLAS 
BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); - ASSERT_DBL_NEAR_TOL(te_d1, tr_d1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_d2, tr_d2, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_x1, tr_x1, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(te_y1, tr_y1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); for(i=0; i<5; i++){ - ASSERT_DBL_NEAR_TOL(te_param[i], tr_param[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +CTEST(drotmg, drotmg_D1_big_D2_big_flag_zero) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]={1.,4096.,-4096.,1.,4096.}; + double tr_param[5]={-1.,4096.,-3584.,1792.,4096.}; + int i=0; + te_d1= tr_d1=1600000000.; + te_d2= tr_d2=800000000.; + te_x1= tr_x1=8.; + te_y1= tr_y1=7.; + + + //reference values as calculated by gonum + tr_d1= 68.96627824858757; + tr_d2= 34.483139124293785; + tr_x1= 45312.; + tr_y1= 7.0; + + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); } } @@ -199,8 +237,8 @@ CTEST(axpy,daxpy_inc_0) BLASFUNC(daxpy)(&N,&a,x1,&incX,y1,&incY); for(i=0; i Date: Sun, 4 Mar 2018 19:37:03 +0100 Subject: [PATCH 03/23] Re-enable DAXPY microkernels for x86_64 as the inaccuracies seen in the original testcase for #1332 appear to be due to an artefact that amplifies the very small rounding differences between FMA and discrete multiply+add --- kernel/x86_64/daxpy.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 20075b815..4bde62824 100644 --- 
a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -33,17 +33,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_nehalem-2.c" #elif defined(BULLDOZER) #include "daxpy_microk_bulldozer-2.c" -/* -these appear to be broken, see issue 1332 #elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" #elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" -*/ -#elif defined(HASWELL) || defined(ZEN) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) -#include "daxpy_microk_sandy-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" #endif From 85a41e9cdb2cf798255e1984a5dfabacde1ab1a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Mar 2018 16:58:47 +0100 Subject: [PATCH 04/23] Add multithreading support for Haswell DDOT copied from ashwinyes' implementation in dot_thunderx2t99.c --- kernel/x86_64/ddot.c | 68 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 0a20564cf..7394e352e 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -43,6 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ddot_microk_sandy-2.c" #endif +#if !defined(DSDOT) +#define RETURN_TYPE FLOAT +#else +#define RETURN_TYPE double +#endif + #ifndef HAVE_KERNEL_8 @@ -71,7 +77,7 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #endif -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; @@ -139,4 +145,64 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } +#if defined(SMP) +static int dot_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, RETURN_TYPE *result, BLASLONG dummy3) +{ + *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + FLOAT dot = 0.0; + +#if defined(SMP) + nthreads = num_cpu_avail(1); + + if (inc_x == 0 || inc_y == 0) + nthreads = 1; + + if (n <= 10000) + nthreads = 1; + + if (nthreads == 1) { + dot = dot_compute(n, x, inc_x, y, inc_y); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + RETURN_TYPE *ptr; + +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif +fprintf(stderr,"threaded ddot with %d threads\n",nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, result, 0, + ( void *)dot_thread_function, nthreads); + + ptr = (RETURN_TYPE *)result; + for (i = 0; i < nthreads; i++) { + dot = dot + (*ptr); + ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else 
+ dot = dot_compute(n, x, inc_x, y, inc_y); +#endif + + return dot; +} From a55694dd5b879c4376fd3a386250ac8c941b6ef1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Mar 2018 22:23:36 +0100 Subject: [PATCH 05/23] Declare dot_compute static to avoid conflicts in multiarch builds --- kernel/x86_64/ddot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 7394e352e..8162a5d83 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -77,7 +77,7 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #endif -FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; @@ -189,7 +189,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif -fprintf(stderr,"threaded ddot with %d threads\n",nthreads); blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, ( void *)dot_thread_function, nthreads); From 28ac9ea5a6de2eeec434f887ba9e6f03d1350d64 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 17 Mar 2018 13:49:15 +0100 Subject: [PATCH 06/23] Use generic/dot.c instead of the inferior arm/dot.c for x86 DSDOT to resolve dsdot utest failure seen in #1492 --- kernel/x86/KERNEL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86/KERNEL b/kernel/x86/KERNEL index 39be2ef80..83b51db13 100644 --- a/kernel/x86/KERNEL +++ b/kernel/x86/KERNEL @@ -169,7 +169,7 @@ ifndef ZDOTKERNEL ZDOTKERNEL = ../arm/zdot.c endif -DSDOTKERNEL = ../arm/dot.c +DSDOTKERNEL = ../generic/dot.c # Bug in znrm2 assembler kernel ifndef ZNRM2KERNEL From e453555d97732f1691c0f07378486e10ab04cd86 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Mar 2018 18:02:23 +0100 Subject: [PATCH 07/23] Disable CPU affinity by default 
again This setting must have been changed unintentionally by my PR #1214 (probably leftover from unrelated tests) --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 718f04090..62bf63df4 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -100,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1 NO_WARMUP = 1 # If you want to disable CPU/Memory affinity on Linux. -#NO_AFFINITY = 1 +NO_AFFINITY = 1 # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # BIGNUMA = 1 From 28ca97015dff0aa008a4e49f4faad26ea3fc6b1f Mon Sep 17 00:00:00 2001 From: QWR QWR Date: Wed, 7 Mar 2018 10:01:03 -0500 Subject: [PATCH 08/23] power8:Added initial zgemv_(t|n) ,i(d|z)amax,i(d|z)amin,dgemv_t(transposed),zrot z13: improved zgemv_(t|n)_4,zscal,zaxpy --- kernel/power/KERNEL.POWER8 | 16 +- kernel/power/dgemv_t.c | 886 +++++++++++++++++++++++ kernel/power/idamax.c | 383 ++++++++++ kernel/power/idamin.c | 384 ++++++++++ kernel/power/izamax.c | 362 ++++++++++ kernel/power/izamin.c | 361 ++++++++++ kernel/power/zgemv_n_4.c | 958 +++++++++++++++++++++++++ kernel/power/zgemv_t_4.c | 847 ++++++++++++++++++++++ kernel/power/zrot.c | 265 +++++++ kernel/zarch/zaxpy.c | 216 +++--- kernel/zarch/zgemv_n_4.c | 1395 ++++++++++++++++++------------------ kernel/zarch/zgemv_t_4.c | 947 ++++++++++++------------ kernel/zarch/zscal.c | 171 +++-- 13 files changed, 5875 insertions(+), 1316 deletions(-) create mode 100644 kernel/power/dgemv_t.c create mode 100644 kernel/power/idamax.c create mode 100644 kernel/power/idamin.c create mode 100644 kernel/power/izamax.c create mode 100644 kernel/power/izamin.c create mode 100644 kernel/power/zgemv_n_4.c create mode 100644 kernel/power/zgemv_t_4.c create mode 100644 kernel/power/zrot.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 594abf795..00ff8682a 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -90,14 +90,14 @@ 
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #DMINKERNEL = ../arm/min.c # #ISAMAXKERNEL = ../arm/iamax.c -#IDAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = idamax.c #ICAMAXKERNEL = ../arm/izamax.c -#IZAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = izamax.c # #ISAMINKERNEL = ../arm/iamin.c -#IDAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = idamin.c #ICAMINKERNEL = ../arm/izamin.c -#IZAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c #IDMAXKERNEL = ../arm/imax.c @@ -134,7 +134,7 @@ ZNRM2KERNEL = ../arm/znrm2.c SROTKERNEL = srot.c DROTKERNEL = drot.c #CROTKERNEL = ../arm/zrot.c -#ZROTKERNEL = ../arm/zrot.c +ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c @@ -150,12 +150,12 @@ ZSWAPKERNEL = zswap.c #SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n.c #CGEMVNKERNEL = ../arm/zgemv_n.c -#ZGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c # #SGEMVTKERNEL = ../arm/gemv_t.c -#DGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = dgemv_t.c #CGEMVTKERNEL = ../arm/zgemv_t.c -#ZGEMVTKERNEL = zgemv_t_4.c +ZGEMVTKERNEL = zgemv_t_4.c #SSYMV_U_KERNEL = ../generic/symv_k.c diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c new file mode 100644 index 000000000..3974ed62d --- /dev/null +++ b/kernel/power/dgemv_t.c @@ -0,0 +1,886 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "common.h" + +#define NBMAX 8192 +#define PREFETCH 1 +#include + +#define HAVE_KERNEL4x8_ASM 1 + + +#if defined(HAVE_KERNEL4x8_ASM) +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + BLASLONG off2; + BLASLONG tempR; + __asm__( + + "sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2 + "sldi %[off], %[off], 3 \n\t" // lda * sizeof (double) + "xxlxor 34,34,34 \n\t" + "xxlxor 35,34,34 \n\t" + "add %[a2], %[a0], %[temp] \n\t" + "add %[a1], %[a0], %[off] \n\t" + "xxlxor 4,34,34 \n\t" + "xxlxor 5,34,34 \n\t" + "xxlxor 6,34,34 \n\t" + "xxlxor 7,34,34 \n\t" + "add %[a3], %[a2], %[off] \n\t" + "add %[a4], %[a2], %[temp] \n\t" + + "xxlxor 8,34,34 \n\t" + "xxlxor 9,34,34 \n\t" + "add %[a5], %[a3], %[temp] \n\t" + "li %[off],0 \n\t" + "li %[off2],16 \n\t" + + "add %[a6], %[a4], %[temp] \n\t" + "add %[a7], %[a5], %[temp] \n\t" + + + + + 
"lxvd2x 32, %[x], %[off] \n\t" + "lxvd2x 36, %[a0], %[off] \n\t" + "lxvd2x 38, %[a1], %[off] \n\t" + "lxvd2x 40, %[a2], %[off] \n\t" + "lxvd2x 42, %[a3], %[off] \n\t" + "lxvd2x 44, %[a4], %[off] \n\t" + "lxvd2x 46, %[a5], %[off] \n\t" + "lxvd2x 48, %[a6], %[off] \n\t" + "lxvd2x 50, %[a7], %[off] \n\t" + "lxvd2x 33, %[x], %[off2] \n\t" + "lxvd2x 37, %[a0], %[off2] \n\t" + "lxvd2x 39, %[a1], %[off2] \n\t" + "lxvd2x 41, %[a2], %[off2] \n\t" + "lxvd2x 43, %[a3], %[off2] \n\t" + "lxvd2x 45, %[a4], %[off2] \n\t" + "lxvd2x 47, %[a5], %[off2] \n\t" + "lxvd2x 49, %[a6], %[off2] \n\t" + "lxvd2x 51, %[a7], %[off2] \n\t" +#if defined(PREFETCH) + "li %[temp],896 \n\t" +#endif + "addic. %[n],%[n],-4 \n\t" + + "li %[off],32 \n\t" + + + "ble- 2f \n\t" + + //-------------------------------------------------- + ".p2align 5 \n\t" + "1: \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "lxvd2x 36, %[a0], %[off] \n\t" + "lxvd2x 38, %[a1], %[off] \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "lxvd2x 40, %[a2], %[off] \n\t" + "lxvd2x 42, %[a3], %[off] \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "lxvd2x 44, %[a4], %[off] \n\t" + "lxvd2x 46, %[a5], %[off] \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + "lxvd2x 48, %[a6], %[off] \n\t" + "lxvd2x 50, %[a7], %[off] \n\t" + "lxvd2x 32, %[x], %[off] \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvd2x 37, %[a0], %[off2] \n\t" + "lxvd2x 39, %[a1], %[off2] \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvd2x 41, %[a2], %[off2] \n\t" + "lxvd2x 43, %[a3], %[off2] \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvd2x 45, %[a4], %[off2] \n\t" + "lxvd2x 47, %[a5], %[off2] \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic. 
%[n],%[n],-4 \n\t" + "lxvd2x 49, %[a6], %[off2] \n\t" + "lxvd2x 51, %[a7], %[off2] \n\t" + "lxvd2x 33, %[x], %[off2] \n\t" + "ble- 2f \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "lxvd2x 36, %[a0], %[off] \n\t" + "lxvd2x 38, %[a1], %[off] \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "lxvd2x 40, %[a2], %[off] \n\t" + "lxvd2x 42, %[a3], %[off] \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "lxvd2x 44, %[a4], %[off] \n\t" + "lxvd2x 46, %[a5], %[off] \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + "lxvd2x 48, %[a6], %[off] \n\t" + "lxvd2x 50, %[a7], %[off] \n\t" + "lxvd2x 32, %[x], %[off] \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvd2x 37, %[a0], %[off2] \n\t" + "lxvd2x 39, %[a1], %[off2] \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvd2x 41, %[a2], %[off2] \n\t" + "lxvd2x 43, %[a3], %[off2] \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvd2x 45, %[a4], %[off2] \n\t" + "lxvd2x 47, %[a5], %[off2] \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic. 
%[n],%[n],-4 \n\t" + "lxvd2x 49, %[a6], %[off2] \n\t" + "lxvd2x 51, %[a7], %[off2] \n\t" + "lxvd2x 33, %[x], %[off2] \n\t" + "ble- 2f \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" +#if defined(PREFETCH) + "addi %[temp],%[temp],128 \n\t" +#endif + "addi %[off2], %[off2],32 \n\t" + "lxvd2x 36, %[a0], %[off] \n\t" + "lxvd2x 38, %[a1], %[off] \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "lxvd2x 40, %[a2], %[off] \n\t" + "lxvd2x 42, %[a3], %[off] \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "lxvd2x 44, %[a4], %[off] \n\t" + "lxvd2x 46, %[a5], %[off] \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + "lxvd2x 48, %[a6], %[off] \n\t" + "lxvd2x 50, %[a7], %[off] \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a0] \n\t" +#endif + "lxvd2x 32, %[x], %[off] \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvd2x 37, %[a0], %[off2] \n\t" + "lxvd2x 39, %[a1], %[off2] \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a1] \n\t" +#endif + "lxvd2x 41, %[a2], %[off2] \n\t" + "addi %[off], %[off],32 \n\t" + "lxvd2x 43, %[a3], %[off2] \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvd2x 45, %[a4], %[off2] \n\t" + "lxvd2x 47, %[a5], %[off2] \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a3] \n\t" +#endif + "lxvd2x 49, %[a6], %[off2] \n\t" + "lxvd2x 51, %[a7], %[off2] \n\t" + + "lxvd2x 33, %[x], %[off2] \n\t" + "addic. 
%[n],%[n],-4 \n\t" + "ble- 2f \n\t" + + "addi %[off2], %[off2],32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a2] \n\t" +#endif + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "lxvd2x 36, %[a0], %[off] \n\t" + "lxvd2x 38, %[a1], %[off] \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "lxvd2x 40, %[a2], %[off] \n\t" + "lxvd2x 42, %[a3], %[off] \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a4] \n\t" +#endif + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "lxvd2x 44, %[a4], %[off] \n\t" + "lxvd2x 46, %[a5], %[off] \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + "lxvd2x 48, %[a6], %[off] \n\t" + "lxvd2x 50, %[a7], %[off] \n\t" + "lxvd2x 32, %[x], %[off] \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a5] \n\t" +#endif + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvd2x 37, %[a0], %[off2] \n\t" + "lxvd2x 39, %[a1], %[off2] \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvd2x 41, %[a2], %[off2] \n\t" + "lxvd2x 43, %[a3], %[off2] \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a6] \n\t" +#endif + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvd2x 45, %[a4], %[off2] \n\t" + "lxvd2x 47, %[a5], %[off2] \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a7] \n\t" +#endif + "lxvd2x 49, %[a6], %[off2] \n\t" + "addic. 
%[n],%[n],-4 \n\t" + "lxvd2x 51, %[a7], %[off2] \n\t" + "lxvd2x 33, %[x], %[off2] \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[x] \n\t" +#endif + "bgt+ 1b \n\t" + ".p2align 5 \n\t" + "2: \n\t" + //-------------------------------------------- + + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + "xxspltd 36, %x[alpha], 0 \n\t" + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "lxvd2x 37, 0, %[y] \n\t" + "li %[off2],16 \n\t" + "lxvd2x 38, %[off2], %[y] \n\t" + + "li %[off2],32 \n\t" + "lxvd2x 39, %[off2], %[y] \n\t" + "li %[off2],48 \n\t" + "lxvd2x 40, %[off2], %[y] \n\t" + + + + "xxmrgld 42,34,35 \n\t" + "xxmrghd 43,34,35 \n\t" + + "xxmrgld 44,4,5 \n\t" + "xxmrghd 45,4,5 \n\t" + + "xvadddp 42,42,43 \n\t" + + "xxmrgld 46,6,7 \n\t" + "xxmrghd 47,6,7 \n\t" + + "xvadddp 44,44,45 \n\t" + + "xxmrgld 48,8,9 \n\t" + "xxmrghd 49,8,9 \n\t" + + "xvadddp 46,46,47 \n\t" + + "xvmaddadp 37,42,36 \n\t" + "xvmaddadp 38,44,36 \n\t" + + "xvadddp 48,48,49 \n\t" + + "xvmaddadp 39,46,36 \n\t" + + "stxvd2x 37, 0, %[y] \n\t" + "li %[off],16 \n\t" + "stxvd2x 38, %[off], %[y] \n\t" + "xvmaddadp 40,48,36 \n\t" + "li %[off],32 \n\t" + "stxvd2x 39, %[off], %[y] \n\t" + "stxvd2x 40, %[off2], %[y] \n\t" + + : [memy] "+m" (*(const double (*)[8])y), + [n] "+&r" (n), + [a0] "=b" (a0), + [a1] "=&b" (a1), + [a2] "=&b" (a2), + [a3] "=&b" (a3), + [a4] "=&b" (a4), + [a5] "=&b" (a5), + [a6] "=&b" (a6), + [a7] "=&b" (a7), + [off] "+&b" (lda), + [off2]"=&b" (off2), + [temp] "=&b" (tempR) + : [memx] "m" (*(const double (*)[n])x), + [mem_ap] "m" (*(const double (*)[]) ap), + [alpha] "d" (alpha), + "[a0]" (ap), + [x] "b" (x), + [y] "b" (y) + : 
"cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39", + "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + ); + return; +} +#else +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; +#if defined(PREFETCH) + BLASLONG j, c, k; +#endif + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector double *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register __vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector double*) a0; + va1 = (__vector double*) a1; + va2 = (__vector double*) a2; + va3 = (__vector double*) a3; + va4 = (__vector double*) a4; + va5 = (__vector double*) a5; + va6 = (__vector double*) a6; + va7 = (__vector double*) a7; + v_x = (__vector double*) x; + +#if defined(PREFETCH) + + c = n >> 1; + + for (j = 0; j < c; j += 64) { + k = (c - j) > 64 ? 
64 : (c - j); + __builtin_prefetch(v_x + 64); + __builtin_prefetch(va0 + 64); + __builtin_prefetch(va1 + 64); + __builtin_prefetch(va2 + 64); + __builtin_prefetch(va3 + 64); + __builtin_prefetch(va4 + 64); + __builtin_prefetch(va5 + 64); + __builtin_prefetch(va6 + 64); + __builtin_prefetch(va7 + 64); + for (i = 0; i < k; i += 2) { +#else + + for (i = 0; i < n/2; i += 2) { +#endif + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + temp0 += v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i + 1] * va1[i + 1]; + temp2 += v_x[i + 1] * va2[i + 1]; + temp3 += v_x[i + 1] * va3[i + 1]; + + temp4 += v_x[i + 1] * va4[i + 1]; + temp5 += v_x[i + 1] * va5[i + 1]; + temp6 += v_x[i + 1] * va6[i + 1]; + temp7 += v_x[i + 1] * va7[i + 1]; + } +#if defined(PREFETCH) + va0 += 64; + va1 += 64; + va2 += 64; + va3 += 64; + va4 += 64; + va5 += 64; + va6 += 64; + va7 += 64; + v_x += 64; + + } +#endif + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + + y[4] += alpha * (temp4[0] + temp4[1]); + y[5] += alpha * (temp5[0] + temp5[1]); + y[6] += alpha * (temp6[0] + temp6[1]); + y[7] += alpha * (temp7[0] + temp7[1]); + +} + +#endif + + +static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* va2 = (__vector double*) a2; + __vector double* va3 = (__vector double*) a3; + __vector double* v_x = (__vector double*) x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register 
__vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i + 1] * va0[i + 1]; + temp5 += v_x[i + 1] * va1[i + 1]; + temp6 += v_x[i + 1] * va2[i + 1]; + temp7 += v_x[i + 1] * va3[i + 1]; + } + + temp0 += temp4; + temp1 += temp5; + temp2 += temp6; + temp3 += temp7; + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + +} + + +static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + __vector double temp1 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i] * va1[i] + v_x[i + 1] * va1[i + 1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]); + y[inc_y] += alpha * (temp1[0] + temp1[1]); +} + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector double* va0 = (__vector double*) a0; + __vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + } + + *y += alpha * (temp0[0] + temp0[1]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, 
FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8], *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; +#if defined(PREFETCH) + __builtin_prefetch(y_ptr+64); +#endif + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + dgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); 
+ a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += 
aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c new file mode 100644 index 000000000..5bdc0a13c 
--- /dev/null +++ b/kernel/power/idamax.c @@ -0,0 +1,383 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "common.h" +#include +#include +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + register __vector long long start = {1,0}; + register __vector long long temp_add_index = {2, 2}; + __asm__( + + + "lxvd2x 44, 0,%[ptr_tmp] \n\t" + "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" + "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" + "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" + "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" + "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" + "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + + "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 + "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 + "xxlxor 37,37 ,37 \n\t" //v5 v37 index_count + "vaddudm 10,9,%[adder] \n\t" //{5,4} vs42 + "xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_max_index + "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 + "xxlxor 39,39,39 \n\t" // vs39 vec_max_value + "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 + "xxspltd 36,36,0 \n\t" + + "xvabsdp 44, 44 \n\t" + "xvabsdp 45, 45 \n\t" + "xvabsdp 46, 46 \n\t" + "xvabsdp 47, 47 \n\t" + "xvabsdp 48, 48 \n\t" + "xvabsdp 49, 49 \n\t" + "xvabsdp 50, 50 \n\t" + "xvabsdp 51, 51 \n\t" + + //jump first half forward + "b 2f \n\t" + +//=================================================================== + + ".p2align 5 \n\t" + + "1: \n\t" + "xvcmpgtdp 2,45,44 \n\t " + "xvcmpgtdp 3,47,46 \n\t " + "xvcmpgtdp 4,49,48 \n\t " + "xvcmpgtdp 5,51,50 \n\t" + + "xxsel 32,40,41,2 \n\t" + "xxsel 0,44,45,2 \n\t" + "xxsel 33,42,43,3 \n\t" + "xxsel 1,46,47,3 \n\t" + "xxsel 34,40,41,4 \n\t" + "xxsel 45,48,49,4 \n\t" + "xxsel 35,42,43,5 \n\t" + "xxsel 
47,50,51,5 \n\t" + + "xvcmpgtdp 2, 1,0 \n\t" + "xvcmpgtdp 3,47, 45 \n\t" + + "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" + + "xxsel 32,32,33,2 \n\t" + "xxsel 0 ,0,1,2 \n\t" + "xxsel 34,34,35,3 \n\t" + "xxsel 5,45,47,3 \n\t" + + + //load next 64 + "lxvd2x 44, 0,%[ptr_tmp] \n\t" + "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" + + // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} + "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} + + "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" + "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" + + //choose bigger from first and second part + "xvcmpgtdp 4,5 , 0 \n\t" + "xxsel 3, 0,5,4 \n\t" + "xxsel 33,32,34,4 \n\t" + + //load next 64 + "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" + "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" + + "vaddudm 1,1,5 \n\t" // get real index for first bigger + + "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + + //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39) + "xvcmpgtdp 2, 3,39 \n\t" + "xxsel 39,39,3,2 \n\t" + "xxsel 38,38,33,2 \n\t" + + //update index += 8 + "vaddudm 5,5,4 \n\t" + + "xvabsdp 44, 44 \n\t" + "xvabsdp 45, 45 \n\t" + "xvabsdp 46, 46 \n\t" + "xvabsdp 47, 47 \n\t" + + //update index += 8 + "vaddudm 5,5,4 \n\t" + + "xvabsdp 48, 48 \n\t" + "xvabsdp 49, 49 \n\t" + "xvabsdp 50, 50 \n\t" + "xvabsdp 51, 51 \n\t" + +//<-----------jump here from first load + "2: \n\t" + + "xvcmpgtdp 2,45,44 \n\t " + "xvcmpgtdp 3,47,46 \n\t " + "xvcmpgtdp 4,49,48 \n\t " + "xvcmpgtdp 5,51,50 \n\t" + + "xxsel 32,40,41,2 \n\t" + "xxsel 0,44,45,2 \n\t" + "xxsel 33,42,43,3 \n\t" + "xxsel 1,46,47,3 \n\t" + "xxsel 34,40,41,4 \n\t" + "xxsel 45,48,49,4 \n\t" + "xxsel 35,42,43,5 \n\t" + "xxsel 47,50,51,5 \n\t" + + "xvcmpgtdp 2, 1,0 \n\t" + "xvcmpgtdp 3,47, 45 \n\t" + "xxsel 32,32,33,2 \n\t" + "xxsel 0 ,0,1,2 \n\t" + "xxsel 34,34,35,3 \n\t" + "xxsel 5,45,47,3 \n\t" + + "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" + // for {second 8 elements } we have to add 8 to each so that it became {from 8 
to 16} + "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} + + //load next 64 + "lxvd2x 44, 0,%[ptr_tmp] \n\t" + "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" + "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" + "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" + + //choose bigger from first and second part + "xvcmpgtdp 4,5 , 0 \n\t" + "xxsel 3, 0,5,4 \n\t" + "xxsel 33,32,34,4 \n\t" + + //load next 64 + "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" + "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" + + "vaddudm 1,1,5 \n\t" // get real index for first bigger + + "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + + + + //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39) + "xvcmpgtdp 2, 3,39 \n\t" + "xxsel 39,39,3,2 \n\t" + "xxsel 38,38,33,2 \n\t" + + //update index += 8 + "vaddudm 5,5,4 \n\t" + + "xvabsdp 44, 44 \n\t" + "xvabsdp 45, 45 \n\t" + "xvabsdp 46, 46 \n\t" + "xvabsdp 47, 47 \n\t" + + //update index += 8 + "vaddudm 5,5,4 \n\t" + + "xvabsdp 48, 48 \n\t" + "xvabsdp 49, 49 \n\t" + "xvabsdp 50, 50 \n\t" + "xvabsdp 51, 51 \n\t" + + //decrement n + "addic. 
%[n], %[n], -32 \n\t" + + //Loop back if >0 + "bgt+ 1b \n\t" + +//============================================================================== + + "xvcmpgtdp 2,45,44 \n\t " + "xvcmpgtdp 3,47,46 \n\t " + "xvcmpgtdp 4,49,48 \n\t " + "xvcmpgtdp 5,51,50 \n\t" + + "xxsel 32,40,41,2 \n\t" + "xxsel 0,44,45,2 \n\t" + "xxsel 33,42,43,3 \n\t" + "xxsel 1,46,47,3 \n\t" + "xxsel 34,40,41,4 \n\t" + "xxsel 45,48,49,4 \n\t" + "xxsel 35,42,43,5 \n\t" + "xxsel 47,50,51,5 \n\t" + + "xvcmpgtdp 2, 1,0 \n\t" + "xvcmpgtdp 3,47, 45 \n\t" + + + "xxsel 32,32,33,2 \n\t" + "xxsel 0 ,0,1,2 \n\t" + "xxsel 34,34,35,3 \n\t" + "xxsel 5,45,47,3 \n\t" + + // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} + "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} + //choose bigger from first and second part + "xvcmpgtdp 4,5 , 0 \n\t" + "xxsel 3, 0,5,4 \n\t" + "xxsel 33,32,34,4 \n\t" + + "vaddudm 1,1,5 \n\t" // get real index for first bigger + + //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39) + "xvcmpgtdp 2, 3,39 \n\t" + "xxsel 39,39,3,2 \n\t" + "xxsel 38,38,33,2 \n\t" + + ///////extract max value and max index from vector + + "xxspltd 32,38,1 \n\t" + "xxspltd 40,39,1 \n\t" + "xvcmpeqdp. 
2, 40,39 \n\t" + + //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely + //0b001110=14 + "bc 14,24, 3f \n\t" + "xvcmpgtdp 4, 40,39 \n\t" + "xxsel 0,39,40,4 \n\t" + "xxsel 1,38,32,4 \n\t" + "stxsdx 0,0,%[ptr_maxf] \n\t" + "b 4f \n\t" + + "3: \n\t" + //if elements value are equal then choose minimum index + "xxspltd 0,40,0 \n\t" + "vminud 0,0,6 \n\t" //vs32 vs38 + "xxlor 1,32,32 \n\t" + "stxsdx 0,0,%[ptr_maxf] \n\t" + + + "4: \n\t" + "mfvsrd %[index],1 \n\t" + + : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) + : [mem] "m"(*(const double (*)[n])x), [ptr_x] "b"(x), [ptr_maxf] "b"(maxf) , + [i16] "b"(16), [i32] "b"(32), [i48] "b"(48), + [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), + [start] "v"(start), [adder] "v"(temp_add_index) + : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + ); + + + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = diamax_kernel_32(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; 
+ } + return (max + 1); + } +} diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c new file mode 100644 index 000000000..f4d1d1bdb --- /dev/null +++ b/kernel/power/idamin.c @@ -0,0 +1,384 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return minimum index + */ +static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + register __vector long long start = {1,0}; + register __vector long long temp_add_index = {2, 2}; + __asm__( + + + "lxvd2x 44, 0,%[ptr_tmp] \n\t" + "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" + "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" + "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" + "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" + "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" + "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + + "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 + "vaddudm 9,8, %[adder] \n\t" //{3,2} vs41 + "xxlxor 37,37 ,37 \n\t" //v5 v37 index_count + "vaddudm 10,9,%[adder] \n\t" //{5,4} vs42 + "xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_min_index + "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 + "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value + "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 + "xxspltd 36,36,0 \n\t" + "xvabsdp 39, 39 \n\t" + + "xvabsdp 44, 44 \n\t" + "xvabsdp 45, 45 \n\t" + "xvabsdp 46, 46 \n\t" + "xvabsdp 47, 47 \n\t" + "xvabsdp 48, 48 \n\t" + "xvabsdp 49, 49 \n\t" + "xvabsdp 50, 50 \n\t" + "xvabsdp 51, 51 \n\t" + + //jump first half forward + "b 2f \n\t" + +//=================================================================== + + ".p2align 5 \n\t" + + "1: \n\t" + "xvcmpgedp 2,44,45 \n\t " + "xvcmpgedp 3,46,47 \n\t " + "xvcmpgedp 4,48,49 \n\t " + "xvcmpgedp 5,50,51 \n\t" + + "xxsel 32,40,41,2 \n\t" + "xxsel 0,44,45,2 \n\t" + "xxsel 33,42,43,3 \n\t" + "xxsel 1,46,47,3 \n\t" + "xxsel 34,40,41,4 \n\t" + "xxsel 45,48,49,4 \n\t" + "xxsel 
35,42,43,5 \n\t" + "xxsel 47,50,51,5 \n\t" + + "xvcmpgedp 2,0, 1 \n\t" + "xvcmpgedp 3, 45,47 \n\t" + + "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" + + "xxsel 32,32,33,2 \n\t" + "xxsel 0 ,0,1,2 \n\t" + "xxsel 34,34,35,3 \n\t" + "xxsel 5,45,47,3 \n\t" + + + //load next 64 + "lxvd2x 44, 0,%[ptr_tmp] \n\t" + "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" + + // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} + "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} + + "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" + "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" + + //choose smaller from first and second part + "xvcmpgedp 4, 0,5 \n\t" + "xxsel 3, 0,5,4 \n\t" + "xxsel 33,32,34,4 \n\t" + + //load next 64 + "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" + "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" + + "vaddudm 1,1,5 \n\t" // get real index for first smaller + + "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + + //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) + "xvcmpgedp 2,39, 3 \n\t" + "xxsel 39,39,3,2 \n\t" + "xxsel 38,38,33,2 \n\t" + + //update index += 8 + "vaddudm 5,5,4 \n\t" + + "xvabsdp 44, 44 \n\t" + "xvabsdp 45, 45 \n\t" + "xvabsdp 46, 46 \n\t" + "xvabsdp 47, 47 \n\t" + + //update index += 8 + "vaddudm 5,5,4 \n\t" + + "xvabsdp 48, 48 \n\t" + "xvabsdp 49, 49 \n\t" + "xvabsdp 50, 50 \n\t" + "xvabsdp 51, 51 \n\t" + +//<-----------jump here from first load + "2: \n\t" + + "xvcmpgedp 2,44,45 \n\t " + "xvcmpgedp 3,46,47 \n\t " + "xvcmpgedp 4,48,49 \n\t " + "xvcmpgedp 5,50,51 \n\t" + + "xxsel 32,40,41,2 \n\t" + "xxsel 0,44,45,2 \n\t" + "xxsel 33,42,43,3 \n\t" + "xxsel 1,46,47,3 \n\t" + "xxsel 34,40,41,4 \n\t" + "xxsel 45,48,49,4 \n\t" + "xxsel 35,42,43,5 \n\t" + "xxsel 47,50,51,5 \n\t" + + "xvcmpgedp 2,0, 1 \n\t" + "xvcmpgedp 3, 45,47 \n\t" + "xxsel 32,32,33,2 \n\t" + "xxsel 0 ,0,1,2 \n\t" + "xxsel 34,34,35,3 \n\t" + "xxsel 5,45,47,3 \n\t" + + "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" + // for {second 8 elements } we have to add 8 to each 
so that it became {from 8 to 16} + "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} + + //load next 64 + "lxvd2x 44, 0,%[ptr_tmp] \n\t" + "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" + "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" + "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" + + //choose smaller from first and second part + "xvcmpgedp 4, 0,5 \n\t" + "xxsel 3, 0,5,4 \n\t" + "xxsel 33,32,34,4 \n\t" + + //load next 64 + "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" + "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" + + "vaddudm 1,1,5 \n\t" // get real index for first smaller + + "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + + + + //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) + "xvcmpgedp 2,39, 3 \n\t" + "xxsel 39,39,3,2 \n\t" + "xxsel 38,38,33,2 \n\t" + + //update index += 8 + "vaddudm 5,5,4 \n\t" + + "xvabsdp 44, 44 \n\t" + "xvabsdp 45, 45 \n\t" + "xvabsdp 46, 46 \n\t" + "xvabsdp 47, 47 \n\t" + + //update index += 8 + "vaddudm 5,5,4 \n\t" + + "xvabsdp 48, 48 \n\t" + "xvabsdp 49, 49 \n\t" + "xvabsdp 50, 50 \n\t" + "xvabsdp 51, 51 \n\t" + + //decrement n + "addic. 
%[n], %[n], -32 \n\t" + + //Loop back if >0 + "bgt+ 1b \n\t" + +//============================================================================== + + "xvcmpgedp 2,44,45 \n\t " + "xvcmpgedp 3,46,47 \n\t " + "xvcmpgedp 4,48,49 \n\t " + "xvcmpgedp 5,50,51 \n\t" + + "xxsel 32,40,41,2 \n\t" + "xxsel 0,44,45,2 \n\t" + "xxsel 33,42,43,3 \n\t" + "xxsel 1,46,47,3 \n\t" + "xxsel 34,40,41,4 \n\t" + "xxsel 45,48,49,4 \n\t" + "xxsel 35,42,43,5 \n\t" + "xxsel 47,50,51,5 \n\t" + + "xvcmpgedp 2,0, 1 \n\t" + "xvcmpgedp 3, 45,47 \n\t" + + + "xxsel 32,32,33,2 \n\t" + "xxsel 0 ,0,1,2 \n\t" + "xxsel 34,34,35,3 \n\t" + "xxsel 5,45,47,3 \n\t" + + // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} + "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} + //choose smaller from first and second part + "xvcmpgedp 4, 0,5 \n\t" + "xxsel 3, 0,5,4 \n\t" + "xxsel 33,32,34,4 \n\t" + + "vaddudm 1,1,5 \n\t" // get real index for first smaller + + //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) + "xvcmpgedp 2,39, 3 \n\t" + "xxsel 39,39,3,2 \n\t" + "xxsel 38,38,33,2 \n\t" + + ///////extract min value and min index from vector + + "xxspltd 32,38,1 \n\t" + "xxspltd 40,39,1 \n\t" + "xvcmpeqdp. 
2, 40,39 \n\t" + + //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely + //0b001110=14 + "bc 14,24, 3f \n\t" + "xvcmpgedp 4,39, 40 \n\t" + "xxsel 0,39,40,4 \n\t" + "xxsel 1,38,32,4 \n\t" + "stxsdx 0,0,%[ptr_minf] \n\t" + "b 4f \n\t" + + "3: \n\t" + //if elements value are equal then choose minimum index + "xxspltd 0,40,0 \n\t" + "vminud 0,0,6 \n\t" //vs32 vs38 + "xxlor 1,32,32 \n\t" + "stxsdx 0,0,%[ptr_minf] \n\t" + + + "4: \n\t" + "mfvsrd %[index],1 \n\t" + + : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) + : [mem] "m"(*(const double (*)[n])x), [ptr_x] "b"(x), [ptr_minf] "b"(minf) , + [i16] "b"(16), [i32] "b"(32), [i48] "b"(48), + [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), + [start] "v"(start), [adder] "v"(temp_add_index) + : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + ); + + return index; + +} + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + BLASLONG min = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (min); + minf = ABS(x[0]); //index's not incremented + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = diamin_kernel_32(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; 
+ minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c new file mode 100644 index 000000000..cfe78c8c0 --- /dev/null +++ b/kernel/power/izamax.c @@ -0,0 +1,362 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+/* ABS: |v| of a double.  CABS1(x,i): |x[i]| + |x[i+1]|, i.e. |re| + |im| of one complex entry. */
+#define ABS fabs
+#define CABS1(x,i) (ABS((x)[i])+ABS((x)[(i)+1]))
+
+
+/**
+ * Find maximum index: scan n complex entries of x (2*n doubles) with VSX and
+ * pick the one with the largest |re| + |im|.
+ * Warning: requirements n>0 and n % 16 == 0
+ * @param n number of complex entries to scan
+ * @param x pointer to the vector (interleaved re,im doubles)
+ * @param maxf (out) receives the largest |re| + |im| found; its value on entry
+ *             is not read, the running maximum starts from zero
+ * @return index of the selected entry; the caller treats it as zero-based
+ */
+static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
+
+    BLASLONG index;
+    register __vector long long start = {1,0};
+    register __vector long long temp_add_index = {2, 2};
+    __asm__(
+    //preload 8 vectors: 16 doubles, i.e. 8 complex entries
+    "lxvd2x 44, 0,%[ptr_tmp] \n\t"
+    "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
+    "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
+    "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
+    "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
+    "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
+    "lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
+    "lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
+
+    "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
+    "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
+    "xxlxor 37,37 ,37 \n\t" //v5 v37 index_count
+    "vaddudm 10,9,%[adder] \n\t" //{5,4} vs42
+    "xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_max_index
+    "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
+    "xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero
+    "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
+    "xxspltd 36,36,0 \n\t"
+
+
+    //absolute value of every loaded double
+    "xvabsdp 44, 44 \n\t"
+    "xvabsdp 45, 45 \n\t"
+    "xvabsdp 46, 46 \n\t"
+    "xvabsdp 47, 47 \n\t"
+    "xvabsdp 48, 48 \n\t"
+    "xvabsdp 49, 49 \n\t"
+    "xvabsdp 50, 50 \n\t"
+    "xvabsdp 51, 51 \n\t"
+
+    //jump first half forward
+    "b 2f \n\t"
+
+    ".p2align 5 \n\t"
+    "1: \n\t"
+
+    //regroup the halves of each vector pair and add them: one |re|+|im| per entry
+    "xxmrghd 0,44,45 \n\t"
+    "xxmrgld 1,44,45 \n\t"
+    "xxmrghd 2,46,47 \n\t"
+    "xxmrgld 3,46,47 \n\t"
+    "xxmrghd 4,48,49 \n\t"
+    "xxmrgld 5,48,49 \n\t"
+    "xxmrghd 44,50,51 \n\t"
+    "xxmrgld 45,50,51 \n\t"
+
+    "xvadddp 46, 0,1 \n\t"
+    "xvadddp 47, 2,3 \n\t"
+    "xvadddp 48, 4,5 \n\t"
+    "xvadddp 49, 44,45 \n\t"
+
+
+    //pairwise compare/select: keep the larger value together with its index vector
+    "xvcmpgtdp 50,47,46 \n\t "
+    "xvcmpgtdp 51,49,48 \n\t "
+
+    "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
+
+    "xxsel 32,40,41,50 \n\t"
+    "xxsel 0,46,47,50 \n\t"
+    "xxsel 33,42,43,51 \n\t"
+    "xxsel 1,48,49,51 \n\t"
+
+    "lxvd2x 44, 0,%[ptr_tmp] \n\t"
+    "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
+
+    "xvcmpgtdp 2,1,0 \n\t "
+    "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
+    "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
+
+
+    "xxsel 32,32,33,2 \n\t"
+    "xxsel 3,0,1,2 \n\t"
+
+    "vaddudm 0,0,5 \n\t" //add index_count (v5) to the winning indices
+
+    //cmp with previous
+
+    "xvcmpgtdp 4,3,39 \n\t "
+    "vaddudm 5,5,4 \n\t" //advance index_count (v5) by v4
+
+    "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
+    "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
+    "lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
+    "lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
+    //select with previous
+    "xxsel 38,38,32,4 \n\t"
+    "xxsel 39,39,3,4 \n\t"
+
+
+
+
+    "xvabsdp 44, 44 \n\t"
+    "xvabsdp 45, 45 \n\t"
+    "xvabsdp 46, 46 \n\t"
+    "xvabsdp 47, 47 \n\t"
+    "xvabsdp 48, 48 \n\t"
+    "xvabsdp 49, 49 \n\t"
+    "xvabsdp 50, 50 \n\t"
+    "xvabsdp 51, 51 \n\t"
+
+
+//>>/////////////////////////////// half start
+    "2: \n\t"
+    "xxmrghd 0,44,45 \n\t"
+    "xxmrgld 1,44,45 \n\t"
+    "xxmrghd 2,46,47 \n\t"
+    "xxmrgld 3,46,47 \n\t"
+    "xxmrghd 4,48,49 \n\t"
+    "xxmrgld 5,48,49 \n\t"
+    "xxmrghd 44,50,51 \n\t"
+    "xxmrgld 45,50,51 \n\t"
+
+    "xvadddp 46, 0,1 \n\t"
+    "xvadddp 47, 2,3 \n\t"
+    "xvadddp 48, 4,5 \n\t"
+    "xvadddp 49, 44,45 \n\t"
+
+    "xvcmpgtdp 50,47,46 \n\t "
+    "xvcmpgtdp 51,49,48 \n\t "
+
+    "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
+
+    "xxsel 32,40,41,50 \n\t"
+    "xxsel 0,46,47,50 \n\t"
+    "xxsel 33,42,43,51 \n\t"
+    "xxsel 1,48,49,51 \n\t"
+
+    "lxvd2x 44, 0,%[ptr_tmp] \n\t"
+    "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
+
+    "xvcmpgtdp 2,1,0 \n\t "
+    "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
+    "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
+
+
+    "xxsel 32,32,33,2 \n\t"
+    "xxsel 3,0,1,2 \n\t"
+
+    "vaddudm 0,0,5 \n\t"
+
+    //cmp with previous
+
+    "xvcmpgtdp 4,3,39 \n\t "
+    "vaddudm 5,5,4 \n\t"
+
+    "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
+    "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
+    "lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
+    "lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
+    //select with previous
+    "xxsel 38,38,32,4 \n\t"
+    "xxsel 39,39,3,4 \n\t"
+
+
+    "xvabsdp 44, 44 \n\t"
+    "xvabsdp 45, 45 \n\t"
+    "xvabsdp 46, 46 \n\t"
+    "xvabsdp 47, 47 \n\t"
+    "xvabsdp 48, 48 \n\t"
+    "xvabsdp 49, 49 \n\t"
+    "xvabsdp 50, 50 \n\t"
+    "xvabsdp 51, 51 \n\t"
+
+
+    //decrement n
+    "addic. %[n], %[n], -16 \n\t"
+    //Loop back if >0
+    "bgt+ 1b \n\t"
+
+    //tail: the last preloaded half is reduced here without loading any further data
+    "xxmrghd 0,44,45 \n\t"
+    "xxmrgld 1,44,45 \n\t"
+    "xxmrghd 2,46,47 \n\t"
+    "xxmrgld 3,46,47 \n\t"
+    "xxmrghd 4,48,49 \n\t"
+    "xxmrgld 5,48,49 \n\t"
+    "xxmrghd 44,50,51 \n\t"
+    "xxmrgld 45,50,51 \n\t"
+
+    "xvadddp 46, 0,1 \n\t"
+    "xvadddp 47, 2,3 \n\t"
+    "xvadddp 48, 4,5 \n\t"
+    "xvadddp 49, 44,45 \n\t"
+
+
+
+    "xvcmpgtdp 50,47,46 \n\t "
+    "xvcmpgtdp 51,49,48 \n\t "
+
+    "xxsel 32,40,41,50 \n\t"
+    "xxsel 0,46,47,50 \n\t"
+    "xxsel 33,42,43,51 \n\t"
+    "xxsel 1,48,49,51 \n\t"
+
+    "xvcmpgtdp 2,1,0 \n\t "
+    "xxsel 32,32,33,2 \n\t"
+    "xxsel 3,0,1,2 \n\t"
+
+    "vaddudm 0,0,5 \n\t"
+
+    "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
+    //cmp with previous
+
+    "xvcmpgtdp 4,3,39 \n\t "
+    "vaddudm 5,5,4 \n\t"
+    "xxsel 38,38,32,4 \n\t"
+    "xxsel 39,39,3,4 \n\t"
+
+
+    ///////extract max value and max index from vector
+
+    "xxspltd 32,38,1 \n\t"
+    "xxspltd 40,39,1 \n\t"
+    "xvcmpeqdp. 2, 40,39 \n\t"
+
+    //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
+    //0b001110=14
+    "bc 14,24, 3f \n\t"
+    "xvcmpgtdp 4, 40,39 \n\t"
+    "xxsel 0,39,40,4 \n\t"
+    "xxsel 1,38,32,4 \n\t"
+    "stxsdx 0,0,%[ptr_maxf] \n\t"
+    "b 4f \n\t"
+
+    "3: \n\t"
+    //if elements value are equal then choose minimum index
+    "xxspltd 0,40,0 \n\t"
+    "vminud 0,0,6 \n\t" //vs32 vs38
+    "xxlor 1,32,32 \n\t"
+    "stxsdx 0,0,%[ptr_maxf] \n\t"
+
+
+    "4: \n\t"
+    "mfvsrd %[index],1 \n\t"
+
+    : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
+    : [mem] "m"(*(const double (*)[2*n])x), [ptr_x] "b"(x), [ptr_maxf] "b"(maxf) ,
+    [i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
+    [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
+    [start] "v"(start), [adder] "v"(temp_add_index)
+    : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
+    "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
+    );
+
+    return index;
+
+}
+
+/**
+ * Find the entry of complex vector x with the largest |re| + |im|.
+ * @param n entry count; @param x interleaved re,im doubles; @param inc_x stride in entries
+ * @return 1-based position of the chosen entry, or 0 when n <= 0 or inc_x <= 0
+ */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i = 0;
+    BLASLONG ix = 0;
+    FLOAT maxf = 0;
+    BLASLONG max = 0;
+    BLASLONG inc_x2;
+
+    if (n <= 0 || inc_x <= 0) return(max);
+
+    if (inc_x == 1) {
+        //unit stride: VSX kernel on the leading multiple of 16, scalar loop on the rest
+        BLASLONG n1 = n & -16;
+        if (n1 > 0) {
+
+            max = ziamax_kernel_16(n1, x, &maxf);
+            i = n1;
+            ix = n1 << 1; //two doubles per complex entry
+        }
+
+        while(i < n)
+        {
+            if( CABS1(x,ix) > maxf )
+            {
+                max = i;
+                maxf = CABS1(x,ix);
+            }
+            ix += 2;
+            i++;
+        }
+        return (max + 1);
+
+    } else {
+        //any other stride: scalar scan, seeded with entry 0
+        inc_x2 = 2 * inc_x;
+
+        maxf = CABS1(x,0);
+        ix += inc_x2;
+        i++;
+
+        while(i < n)
+        {
+            if( CABS1(x,ix) > maxf )
+            {
+                max = i;
+                maxf = CABS1(x,ix);
+            }
+            ix += inc_x2;
+            i++;
+        }
+        return (max + 1);
+    }
+
+}
+
+
diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c
new file mode 100644
index 000000000..448247ffd
--- /dev/null
+++ b/kernel/power/izamin.c
@@ -0,0 +1,361 @@
+/***************************************************************************
+Copyright (c) 2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+/* ABS: |v| of a double.  CABS1(x,i): |x[i]| + |x[i+1]|, i.e. |re| + |im| of one complex entry. */
+#define ABS fabs
+#define CABS1(x,i) (ABS((x)[i])+ABS((x)[(i)+1]))
+
+/**
+ * Find minimum index: VSX scan of n complex entries (2*n doubles) for the smallest |re| + |im|.
+ * Warning: requirements n>0 and n % 16 == 0
+ * @param n number of complex entries to scan
+ * @param x pointer to the vector (interleaved re,im doubles)
+ * @param minf (in/out) read on entry as the starting minimum (lxvdsx below),
+ *             then overwritten with the smallest |re| + |im| found
+ * @return index of the selected entry; the caller treats it as zero-based
+ */
+static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
+
+    BLASLONG index;
+    register __vector long long start = {1,0};
+    register __vector long long temp_add_index = {2, 2};
+    __asm__(
+    //preload 8 vectors: 16 doubles, i.e. 8 complex entries
+    "lxvd2x 44, 0,%[ptr_tmp] \n\t"
+    "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
+    "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
+    "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
+    "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
+    "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
+    "lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
+    "lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
+
+    "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
+    "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
+    "xxlxor 37,37 ,37 \n\t" //v5 v37 index_count
+    "vaddudm 10,9,%[adder] \n\t" //{5,4} vs42
+    "xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_min_index
+    "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
+    "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
+    "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
+    "xxspltd 36,36,0 \n\t"
+
+
+    //absolute value of every loaded double
+    "xvabsdp 44, 44 \n\t"
+    "xvabsdp 45, 45 \n\t"
+    "xvabsdp 46, 46 \n\t"
+    "xvabsdp 47, 47 \n\t"
+    "xvabsdp 48, 48 \n\t"
+    "xvabsdp 49, 49 \n\t"
+    "xvabsdp 50, 50 \n\t"
+    "xvabsdp 51, 51 \n\t"
+
+    //jump first half forward
+    "b 2f \n\t"
+
+    ".p2align 5 \n\t"
+    "1: \n\t"
+
+    //regroup the halves of each vector pair and add them: one |re|+|im| per entry
+    "xxmrghd 0,44,45 \n\t"
+    "xxmrgld 1,44,45 \n\t"
+    "xxmrghd 2,46,47 \n\t"
+    "xxmrgld 3,46,47 \n\t"
+    "xxmrghd 4,48,49 \n\t"
+    "xxmrgld 5,48,49 \n\t"
+    "xxmrghd 44,50,51 \n\t"
+    "xxmrgld 45,50,51 \n\t"
+
+    "xvadddp 46, 0,1 \n\t"
+    "xvadddp 47, 2,3 \n\t"
+    "xvadddp 48, 4,5 \n\t"
+    "xvadddp 49, 44,45 \n\t"
+
+    //pairwise compare/select: keep the smaller value together with its index vector
+    //NOTE(review): the compares are >=, so on equal values the later candidate appears to win - confirm intended
+    "xvcmpgedp 50,46,47 \n\t "
+    "xvcmpgedp 51,48,49 \n\t "
+
+    "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
+
+    "xxsel 32,40,41,50 \n\t"
+    "xxsel 0,46,47,50 \n\t"
+    "xxsel 33,42,43,51 \n\t"
+    "xxsel 1,48,49,51 \n\t"
+
+    "lxvd2x 44, 0,%[ptr_tmp] \n\t"
+    "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
+
+    "xvcmpgedp 2,0,1 \n\t "
+    "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
+    "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
+
+
+    "xxsel 32,32,33,2 \n\t"
+    "xxsel 3,0,1,2 \n\t"
+
+    "vaddudm 0,0,5 \n\t" //add index_count (v5) to the winning indices
+
+    //cmp with previous
+
+    "xvcmpgedp 4,39,3 \n\t "
+    "vaddudm 5,5,4 \n\t" //advance index_count (v5) by v4
+
+    "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
+    "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
+    "lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
+    "lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
+    //select with previous
+    "xxsel 38,38,32,4 \n\t"
+    "xxsel 39,39,3,4 \n\t"
+
+
+
+
+    "xvabsdp 44, 44 \n\t"
+    "xvabsdp 45, 45 \n\t"
+    "xvabsdp 46, 46 \n\t"
+    "xvabsdp 47, 47 \n\t"
+    "xvabsdp 48, 48 \n\t"
+    "xvabsdp 49, 49 \n\t"
+    "xvabsdp 50, 50 \n\t"
+    "xvabsdp 51, 51 \n\t"
+
+
+//>>/////////////////////////////// half start
+    "2: \n\t"
+    "xxmrghd 0,44,45 \n\t"
+    "xxmrgld 1,44,45 \n\t"
+    "xxmrghd 2,46,47 \n\t"
+    "xxmrgld 3,46,47 \n\t"
+    "xxmrghd 4,48,49 \n\t"
+    "xxmrgld 5,48,49 \n\t"
+    "xxmrghd 44,50,51 \n\t"
+    "xxmrgld 45,50,51 \n\t"
+
+    "xvadddp 46, 0,1 \n\t"
+    "xvadddp 47, 2,3 \n\t"
+    "xvadddp 48, 4,5 \n\t"
+    "xvadddp 49, 44,45 \n\t"
+
+    "xvcmpgedp 50,46,47 \n\t "
+    "xvcmpgedp 51,48,49 \n\t "
+
+    "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
+
+    "xxsel 32,40,41,50 \n\t"
+    "xxsel 0,46,47,50 \n\t"
+    "xxsel 33,42,43,51 \n\t"
+    "xxsel 1,48,49,51 \n\t"
+
+    "lxvd2x 44, 0,%[ptr_tmp] \n\t"
+    "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
+
+    "xvcmpgedp 2,0,1 \n\t "
+    "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
+    "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
+
+
+    "xxsel 32,32,33,2 \n\t"
+    "xxsel 3,0,1,2 \n\t"
+
+    "vaddudm 0,0,5 \n\t"
+
+    //cmp with previous
+
+    "xvcmpgedp 4,39,3 \n\t "
+    "vaddudm 5,5,4 \n\t"
+
+    "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
+    "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
+    "lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
+    "lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
+    //select with previous
+    "xxsel 38,38,32,4 \n\t"
+    "xxsel 39,39,3,4 \n\t"
+
+
+    "xvabsdp 44, 44 \n\t"
+    "xvabsdp 45, 45 \n\t"
+    "xvabsdp 46, 46 \n\t"
+    "xvabsdp 47, 47 \n\t"
+    "xvabsdp 48, 48 \n\t"
+    "xvabsdp 49, 49 \n\t"
+    "xvabsdp 50, 50 \n\t"
+    "xvabsdp 51, 51 \n\t"
+
+
+    //decrement n
+    "addic. %[n], %[n], -16 \n\t"
+    //Loop back if >0
+    "bgt+ 1b \n\t"
+
+    //tail: the last preloaded half is reduced here without loading any further data
+    "xxmrghd 0,44,45 \n\t"
+    "xxmrgld 1,44,45 \n\t"
+    "xxmrghd 2,46,47 \n\t"
+    "xxmrgld 3,46,47 \n\t"
+    "xxmrghd 4,48,49 \n\t"
+    "xxmrgld 5,48,49 \n\t"
+    "xxmrghd 44,50,51 \n\t"
+    "xxmrgld 45,50,51 \n\t"
+
+    "xvadddp 46, 0,1 \n\t"
+    "xvadddp 47, 2,3 \n\t"
+    "xvadddp 48, 4,5 \n\t"
+    "xvadddp 49, 44,45 \n\t"
+
+
+
+    "xvcmpgedp 50,46,47 \n\t "
+    "xvcmpgedp 51,48,49 \n\t "
+
+    "xxsel 32,40,41,50 \n\t"
+    "xxsel 0,46,47,50 \n\t"
+    "xxsel 33,42,43,51 \n\t"
+    "xxsel 1,48,49,51 \n\t"
+
+    "xvcmpgedp 2,0,1 \n\t "
+    "xxsel 32,32,33,2 \n\t"
+    "xxsel 3,0,1,2 \n\t"
+
+    "vaddudm 0,0,5 \n\t"
+
+    "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
+    //cmp with previous
+
+    "xvcmpgedp 4,39,3 \n\t "
+    "vaddudm 5,5,4 \n\t"
+    "xxsel 38,38,32,4 \n\t"
+    "xxsel 39,39,3,4 \n\t"
+
+
+    ///////extract min value and min index from vector
+
+    "xxspltd 32,38,1 \n\t"
+    "xxspltd 40,39,1 \n\t"
+    "xvcmpeqdp. 2, 40,39 \n\t"
+
+    //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
+    //0b001110=14
+    "bc 14,24, 3f \n\t"
+    "xvcmpgedp 4,39, 40 \n\t"
+    "xxsel 0,39,40,4 \n\t"
+    "xxsel 1,38,32,4 \n\t"
+    "stxsdx 0,0,%[ptr_minf] \n\t"
+    "b 4f \n\t"
+
+    "3: \n\t"
+    //if elements value are equal then choose minimum index
+    "xxspltd 0,40,0 \n\t"
+    "vminud 0,0,6 \n\t" //vs32 vs38
+    "xxlor 1,32,32 \n\t"
+    "stxsdx 0,0,%[ptr_minf] \n\t"
+
+
+    "4: \n\t"
+    "mfvsrd %[index],1 \n\t"
+    //"+m": *minf is read by lxvdsx above and written by stxsdx, so it must be declared in/out
+    : [minf] "+m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
+    : [mem] "m"(*(const double (*)[2*n])x), [ptr_x] "b"(x), [ptr_minf] "b"(minf) ,
+    [i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
+    [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
+    [start] "v"(start), [adder] "v"(temp_add_index)
+    : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
+    "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
+    );
+
+    return index;
+}
+
+/**
+ * Find the entry of complex vector x with the smallest |re| + |im|.
+ * @param n entry count; @param x interleaved re,im doubles; @param inc_x stride in entries
+ * @return 1-based position of the chosen entry, or 0 when n <= 0 or inc_x <= 0
+ */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT minf;
+    BLASLONG min=0;
+    BLASLONG inc_x2;
+
+    if (n <= 0 || inc_x <= 0) return(min);
+
+    //unit stride: VSX kernel on the leading multiple of 16, scalar loop on the rest
+    if (inc_x == 1) {
+        minf = CABS1(x,0); //seed with entry 0; the kernel reads minf as its starting minimum
+        BLASLONG n1 = n & -16;
+        if (n1 > 0) {
+
+            min = ziamin_kernel_16_TUNED(n1, x, &minf);
+            i = n1;
+            ix = n1 << 1; //two doubles per complex entry
+        }
+
+        //scalar scan of the entries the kernel did not cover
+        while(i < n)
+        {
+            if( CABS1(x,ix) < minf )
+            {
+                min = i;
+                minf = CABS1(x,ix);
+            }
+            ix += 2;
+            i++;
+        }
+        return (min + 1);
+
+    } else {
+        //any other stride: scalar scan, seeded with entry 0
+        inc_x2 = 2 * inc_x;
+
+        minf = CABS1(x,0);
+        ix += inc_x2;
+        i++;
+
+        while(i < n)
+        {
+            if( CABS1(x,ix) < minf )
+            {
+                min = i;
+                minf = CABS1(x,ix);
+            }
+            ix += inc_x2;
+            i++;
+        }
+        return (min + 1);
+    }
+
+}
+
+
diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c
new file mode 100644
index 000000000..8b250a7f1
--- /dev/null
+++ b/kernel/power/zgemv_n_4.c
@@ -0,0 +1,958 @@
+/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include +#include +#include "common.h" + +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 +#define HAVE_KERNEL_ADDY 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) +#include +#endif + +// +#define NBMAX 4096 + +#ifdef HAVE_KERNEL_4x4_VEC_ASM + +#elif HAVE_KERNEL_4x4_VEC + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + register __vector double vx2_r = {x[4], x[4]}; + register __vector double vx2_i = {-x[5], x[5]}; + register __vector double vx3_r = {x[6], x[6]}; + register __vector double vx3_i = {-x[7], x[7]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; + register __vector double vx2_r = {x[4], -x[4]}; + register __vector double vx2_i = {x[5], x[5]}; + register __vector double vx3_r = {x[6], -x[6]}; + register __vector double vx3_i = {x[7], x[7]}; +#endif + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + register __vector double *vptr_a2 = (__vector double *) a2; + register __vector double *vptr_a3 = (__vector double *) a3; + + + register __vector double vy_0; + register __vector double va0; + register __vector double va1; + register __vector double va2; + register __vector double 
va3; + register __vector double vy_1; + register __vector double va0_1; + register __vector double va1_1; + register __vector double va2_1; + register __vector double va3_1; + register __vector double vy_2; + register __vector double va0_2; + register __vector double va1_2; + register __vector double va2_2; + register __vector double va3_2; + register __vector double vy_3; + register __vector double va0_3; + register __vector double va1_3; + register __vector double va2_3; + register __vector double va3_3; + + BLASLONG i = 0; + while (i < n) { + + vy_0 = vy[i]; + va0 = vptr_a0[i]; + va1 = vptr_a1[i]; + va2 = vptr_a2[i]; + va3 = vptr_a3[i]; + + vy_1 = vy[i + 1]; + va0_1 = vptr_a0[i + 1]; + va1_1 = vptr_a1[i + 1]; + va2_1 = vptr_a2[i + 1]; + va3_1 = vptr_a3[i + 1]; + + vy_2 = vy[i + 2]; + va0_2 = vptr_a0[i + 2]; + va1_2 = vptr_a1[i + 2]; + va2_2 = vptr_a2[i + 2]; + va3_2 = vptr_a3[i + 2]; + + vy_3 = vy[i + 3]; + va0_3 = vptr_a0[i + 3]; + va1_3 = vptr_a1[i + 3]; + va2_3 = vptr_a2[i + 3]; + va3_3 = vptr_a3[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + + + vy_0 += va2*vx2_r; + vy_1 += va2_1*vx2_r; + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_2 += va2_2*vx2_r; + vy_3 += va2_3*vx2_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = vec_xxpermdi(va1_1, va1_1, 2); + + + vy_0 += va3*vx3_r; + vy_1 += va3_1*vx3_r; + + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_2 += va3_2*vx3_r; + vy_3 += va3_3*vx3_r; + + va2 = vec_xxpermdi(va2, va2, 2); + va2_1 = vec_xxpermdi(va2_1, va2_1, 2); + + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + + va2_2 = vec_xxpermdi(va2_2, va2_2, 2); + va2_3 = vec_xxpermdi(va2_3, va2_3, 2); + + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + va3 = 
vec_xxpermdi(va3, va3, 2); + va3_1 = vec_xxpermdi(va3_1, va3_1, 2); + + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + + va3_2 = vec_xxpermdi(va3_2, va3_2, 2); + va3_3 = vec_xxpermdi(va3_3, va3_3, 2); + + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy_0 += va2*vx2_i; + vy_1 += va2_1*vx2_i; + vy_2 += va2_2*vx2_i; + vy_3 += va2_3*vx2_i; + + vy_0 += va3*vx3_i; + vy_1 += va3_1*vx3_i; + vy_2 += va3_2*vx3_i; + vy_3 += va3_3*vx3_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + + i += 4; + + + } + +} +#else + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; +#endif + } +} + +#endif + +#ifdef HAVE_KERNEL_4x2_VEC + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector 
double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va1 = vptr_a1[i]; + register __vector double va1_1 = vptr_a1[i + 1]; + register __vector double va1_2 = vptr_a1[i + 2]; + register __vector double va1_3 = vptr_a1[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = vec_xxpermdi(va1_1, va1_1, 2); + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} +#else + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 
= ap; + a1 = ap + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; +#endif + } +} + +#endif + +#ifdef HAVE_KERNEL_4x1_VEC + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} + +#else + +static void zgemv_kernel_4x1(BLASLONG n, 
FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; +#endif + + } +} + +#endif + +#ifdef HAVE_KERNEL_ADDY + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + +#if !defined(XCONJ) + + register __vector double valpha_r = {alpha_r, alpha_r}; + register __vector double valpha_i = {-alpha_i, alpha_i}; + +#else + register __vector double valpha_r = {alpha_r, -alpha_r}; + register __vector double valpha_i = {alpha_i, alpha_i}; +#endif + + register __vector double *vptr_src = (__vector double *) src; + if (inc_dest != 2) { + register __vector double *vptr_y = (__vector double *) dest; + //note that inc_dest is already 2x. 
so we should add it to double* + register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest); + register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest); + register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest); + BLASLONG dest_t = 0; + BLASLONG add_dest = inc_dest << 1; //inc_dest is already multiplied by 2, so for vector 4 we just multiply 2 times + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vptr_y[dest_t]; + register __vector double vy_1 = vptr_y1[dest_t]; + register __vector double vy_2 = vptr_y2[dest_t]; + register __vector double vy_3 = vptr_y3[dest_t]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += vsrc_3*valpha_r; + + vsrc = vec_xxpermdi(vsrc, vsrc, 2); + vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[dest_t] = vy_0; + vptr_y1[dest_t ] = vy_1; + vptr_y2[dest_t] = vy_2; + vptr_y3[dest_t] = vy_3; + + dest_t += add_dest; + + } + + return; + } else { + register __vector double *vptr_y = (__vector double *) dest; + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vptr_y[i]; + register __vector double vy_1 = vptr_y[i + 1]; + register __vector double vy_2 = vptr_y[i + 2]; + register __vector double vy_3 = vptr_y[i + 3]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += 
vsrc_3*valpha_r; + + vsrc = vec_xxpermdi(vsrc, vsrc, 2); + vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[i] = vy_0; + vptr_y[i + 1 ] = vy_1; + vptr_y[i + 2] = vy_2; + vptr_y[i + 3] = vy_3; + + } + + return; + } + return; +} + +#else + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + if (inc_dest != 2) { + + FLOAT temp_r; + FLOAT temp_i; + for (i = 0; i < n; i++) { +#if !defined(XCONJ) + temp_r = alpha_r * src[0] - alpha_i * src[1]; + temp_i = alpha_r * src[1] + alpha_i * src[0]; +#else + temp_r = alpha_r * src[0] + alpha_i * src[1]; + temp_i = -alpha_r * src[1] + alpha_i * src[0]; +#endif + + *dest += temp_r; + *(dest + 1) += temp_i; + + src += 2; + dest += inc_dest; + } + return; + } + + FLOAT temp_r0; + FLOAT temp_i0; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT temp_r2; + FLOAT temp_i2; + FLOAT temp_r3; + FLOAT temp_i3; + for (i = 0; i < n; i += 4) { +#if !defined(XCONJ) + temp_r0 = alpha_r * src[0] - alpha_i * src[1]; + temp_i0 = alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] - alpha_i * src[3]; + temp_i1 = alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] - alpha_i * src[5]; + temp_i2 = alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] - alpha_i * src[7]; + temp_i3 = alpha_r * src[7] + alpha_i * src[6]; +#else + temp_r0 = alpha_r * src[0] + alpha_i * src[1]; + temp_i0 = -alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] + alpha_i * src[3]; + temp_i1 = -alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] + alpha_i * src[5]; + temp_i2 = -alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] + alpha_i * src[7]; + temp_i3 = -alpha_r * src[7] + alpha_i * src[6]; +#endif + + dest[0] += temp_r0; + dest[1] 
+= temp_i0; + dest[2] += temp_r1; + dest[3] += temp_i1; + dest[4] += temp_r2; + dest[5] += temp_i2; + dest[6] += temp_r3; + dest[7] += temp_i3; + + src += 8; + dest += 8; + } + return; +} +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + ybuffer = buffer; + + inc_x *= 2; + inc_y *= 2; + lda *= 2; + + n1 = n / 4; + n2 = n % 4; + + m3 = m % 4; + m1 = m - (m % 4); + m2 = (m % NBMAX) - (m % 4); + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + a_ptr = a; + + x_ptr = x; + //zero_y(NB,ybuffer); + memset(ybuffer, 0, NB * 16); + + if (inc_x == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer); + + a_ptr += lda << 2; + x_ptr += 8; + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer); + x_ptr += 4; + a_ptr += 2 * lda; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); + x_ptr += 2; + a_ptr += lda; + + } + } else { + + for (i = 0; i < n1; i++) { + + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + xbuffer[3] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[4] = x_ptr[0]; + xbuffer[5] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[6] = x_ptr[0]; + xbuffer[7] = x_ptr[1]; + x_ptr += inc_x; + + zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer); + + a_ptr += lda << 2; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + a_ptr += lda; + + } + + } + + add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); + a += 2 * NB; + y_ptr 
+= NB * inc_y; + } + + if (m3 == 0) return (0); + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r = 0.0; + FLOAT temp_i = 0.0; + + if (lda == 2 && inc_x == 2) { + + for (i = 0; i < (n & -2); i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; +#endif + + a_ptr += 4; + x_ptr += 4; + } + + for (; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; +#endif + + a_ptr += 2; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + + if (lda == 4 && inc_x == 2) { + + 
for (i = 0; i < (n & -2); i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; +#endif + + a_ptr += 8; + x_ptr += 4; + } + + for (; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; +#endif + + a_ptr += 4; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] 
+ a_ptr[3] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + return (0); + } + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_i2 = 0.0; + + if (lda == 6 && inc_x == 2) { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; +#endif + + a_ptr += 6; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + 
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; + y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; +#else + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; + y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; +#endif + return (0); + } + + return (0); +} + diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c new file mode 100644 index 000000000..572206494 --- /dev/null +++ b/kernel/power/zgemv_t_4.c @@ -0,0 +1,847 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/
+
+#include "common.h"
+
+#define NBMAX 4096
+#define HAVE_KERNEL_4x4_VEC 1
+#define HAVE_KERNEL_4x2_VEC 1
+#define HAVE_KERNEL_4x1_VEC 1
+
+#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
+#include <altivec.h>
+#endif
+
+#ifdef HAVE_KERNEL_4x4_VEC_ASM
+
+#elif HAVE_KERNEL_4x4_VEC
+
+/*
+ * zgemv_kernel_4x4 (VSX variant): four complex dot products at once.
+ *
+ * For k = 0..3 the column starting at ap + k * lda is multiplied element by
+ * element with x and summed over n complex values; the sum, scaled by
+ * alpha = (alpha_r, alpha_i), is added to y[2k], y[2k + 1]. CONJ / XCONJ
+ * select the sign pattern of the products and of the final scaling.
+ *
+ *   n    number of complex values per column; four are consumed per
+ *        iteration, so it must be a multiple of 4
+ *   lda  column stride in FLOAT units
+ *   x    contiguous interleaved (re, im) vector
+ *   y    contiguous output, four complex values
+ *
+ * Pointers are read through __vector double, so FLOAT is assumed to be
+ * double.
+ */
+static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+    FLOAT *a0, *a1, *a2, *a3;
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+    /* *_p accumulates the straight products (re*re, im*im);
+     * *_r accumulates the cross products (re*im, im*re). */
+    register __vector double vtemp0_p = {0.0, 0.0};
+    register __vector double vtemp0_r = {0.0, 0.0};
+    register __vector double vtemp1_p = {0.0, 0.0};
+    register __vector double vtemp1_r = {0.0, 0.0};
+    register __vector double vtemp2_p = {0.0, 0.0};
+    register __vector double vtemp2_r = {0.0, 0.0};
+    register __vector double vtemp3_p = {0.0, 0.0};
+    register __vector double vtemp3_r = {0.0, 0.0};
+    i = 0;
+    n = n << 1; /* from complex count to FLOAT count */
+    while (i < n) {
+//        __builtin_prefetch(&x[i]);
+//        __builtin_prefetch(&a0[i]);
+//        __builtin_prefetch(&a1[i]);
+//        __builtin_prefetch(&a2[i]);
+//        __builtin_prefetch(&a3[i]);
+        register __vector double vx_0 = *(__vector double*) (&x[i]);
+        register __vector double vx_1 = *(__vector double*) (&x[i + 2]);
+        register __vector double vx_2 = *(__vector double*) (&x[i + 4]);
+        register __vector double vx_3 = *(__vector double*) (&x[i + 6]);
+
+        register __vector double va0 = *(__vector double*) (&a0[i]);
+        register __vector double va0_1 = *(__vector double*) (&a0[i + 2]);
+        register __vector double va0_2 = *(__vector double*) (&a0[i + 4]);
+        register __vector double va0_3 = *(__vector double*) (&a0[i + 6]);
+
+        register __vector double va1 = *(__vector double*) (&a1[i]);
+        register __vector double va1_1 = *(__vector double*) (&a1[i + 2]);
+        register __vector double va1_2 = *(__vector double*) (&a1[i + 4]);
+        register __vector double va1_3 = *(__vector double*) (&a1[i + 6]);
+
+        register __vector double va2 = *(__vector double*) (&a2[i]);
+        register __vector double va2_1 = *(__vector double*) (&a2[i + 2]);
+        register __vector double va2_2 = *(__vector double*) (&a2[i + 4]);
+        register __vector double va2_3 = *(__vector double*) (&a2[i + 6]);
+
+        register __vector double va3 = *(__vector double*) (&a3[i]);
+        register __vector double va3_1 = *(__vector double*) (&a3[i + 2]);
+        register __vector double va3_2 = *(__vector double*) (&a3[i + 4]);
+        register __vector double va3_3 = *(__vector double*) (&a3[i + 6]);
+
+        /* lane-swapped copies of x: (im, re) */
+        register __vector double vxr_0 = vec_xxpermdi(vx_0, vx_0, 2);
+        register __vector double vxr_1 = vec_xxpermdi(vx_1, vx_1, 2);
+
+        i += 8;
+
+        vtemp0_p += vx_0*va0;
+        vtemp0_r += vxr_0*va0;
+
+        vtemp1_p += vx_0*va1;
+        vtemp1_r += vxr_0*va1;
+
+        vtemp2_p += vx_0*va2;
+        vtemp2_r += vxr_0*va2;
+
+        vtemp3_p += vx_0*va3;
+        vtemp3_r += vxr_0*va3;
+
+        vtemp0_p += vx_1*va0_1;
+        vtemp0_r += vxr_1*va0_1;
+
+        vtemp1_p += vx_1*va1_1;
+        vtemp1_r += vxr_1*va1_1;
+        vxr_0 = vec_xxpermdi(vx_2, vx_2, 2);
+        vtemp2_p += vx_1*va2_1;
+        vtemp2_r += vxr_1*va2_1;
+
+        vtemp3_p += vx_1*va3_1;
+        vtemp3_r += vxr_1*va3_1;
+
+        vtemp0_p += vx_2*va0_2;
+        vtemp0_r += vxr_0*va0_2;
+        vxr_1 = vec_xxpermdi(vx_3, vx_3, 2);
+
+        vtemp1_p += vx_2*va1_2;
+        vtemp1_r += vxr_0*va1_2;
+
+        vtemp2_p += vx_2*va2_2;
+        vtemp2_r += vxr_0*va2_2;
+
+        vtemp3_p += vx_2*va3_2;
+        vtemp3_r += vxr_0*va3_2;
+
+        vtemp0_p += vx_3*va0_3;
+        vtemp0_r += vxr_1*va0_3;
+
+        vtemp1_p += vx_3*va1_3;
+        vtemp1_r += vxr_1*va1_3;
+
+        vtemp2_p += vx_3*va2_3;
+        vtemp2_r += vxr_1*va2_3;
+
+        vtemp3_p += vx_3*va3_3;
+        vtemp3_r += vxr_1*va3_3;
+
+    }
+
+    /* Combine the two lanes of each accumulator into real / imaginary sums. */
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+    register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1];
+    register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1];
+
+    register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1];
+    register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1];
+
+    register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1];
+    register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1];
+
+    register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1];
+    register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1];
+
+#else
+    register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1];
+    register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1];
+
+    register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1];
+    register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1];
+
+    register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1];
+    register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1];
+
+    register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1];
+    register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1];
+
+#endif
+
+#if !defined(XCONJ)
+
+    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+    y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+    y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
+    y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
+    y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
+    y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
+
+#else
+
+    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+    y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+    y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
+    y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+    y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
+    y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
+
+#endif
+}
+
+#else
+
+/*
+ * zgemv_kernel_4x4 (portable variant): same contract as the VSX variant,
+ * one complex element per iteration and no restriction on n.
+ */
+static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+    FLOAT *a0, *a1, *a2, *a3;
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+
+    FLOAT temp_r0 = 0.0;
+    FLOAT temp_r1 = 0.0;
+    FLOAT temp_r2 = 0.0;
+    FLOAT temp_r3 = 0.0;
+    FLOAT temp_i0 = 0.0;
+    FLOAT temp_i1 = 0.0;
+    FLOAT temp_i2 = 0.0;
+    FLOAT temp_i3 = 0.0;
+
+    for (i = 0; i < 2 * n; i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1];
+        temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i];
+        temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1];
+        temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i];
+        temp_r2 += a2[i] * x[i] - a2[i + 1] * x[i + 1];
+        temp_i2 += a2[i] * x[i + 1] + a2[i + 1] * x[i];
+        temp_r3 += a3[i] * x[i] - a3[i + 1] * x[i + 1];
+        temp_i3 += a3[i] * x[i + 1] + a3[i + 1] * x[i];
+#else
+        temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1];
+        temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i];
+        temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1];
+        temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i];
+        temp_r2 += a2[i] * x[i] + a2[i + 1] * x[i + 1];
+        temp_i2 += a2[i] * x[i + 1] - a2[i + 1] * x[i];
+        temp_r3 += a3[i] * x[i] + a3[i + 1] * x[i + 1];
+        temp_i3 += a3[i] * x[i + 1] - a3[i + 1] * x[i];
+#endif
+    }
+
+#if !defined(XCONJ)
+
+    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+    y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+    y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
+    y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
+    y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
+    y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
+
+#else
+
+    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+    y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+    y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
+    y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+    y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
+    y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
+
+#endif
+}
+
+#endif
+
+#ifdef HAVE_KERNEL_4x2_VEC
+
+/*
+ * zgemv_kernel_4x2 (VSX variant): two complex dot products (columns at ap
+ * and ap + lda) against x, scaled by alpha and added to y[0..3]. Same
+ * conventions as zgemv_kernel_4x4; n must be a multiple of 4.
+ */
+static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+    FLOAT *a0, *a1;
+    a0 = ap;
+    a1 = ap + lda;
+    /* *_p accumulates the straight products (re*re, im*im);
+     * *_r accumulates the cross products (re*im, im*re). */
+    register __vector double vtemp0_p = {0.0, 0.0};
+    register __vector double vtemp0_r = {0.0, 0.0};
+    register __vector double vtemp1_p = {0.0, 0.0};
+    register __vector double vtemp1_r = {0.0, 0.0};
+    i = 0;
+    n = n << 1; /* from complex count to FLOAT count */
+    while (i < n) {
+
+        register __vector double vx_0 = *(__vector double*) (&x[i]);
+        register __vector double vx_1 = *(__vector double*) (&x[i + 2]);
+        register __vector double vx_2 = *(__vector double*) (&x[i + 4]);
+        register __vector double vx_3 = *(__vector double*) (&x[i + 6]);
+
+        register __vector double va0 = *(__vector double*) (&a0[i]);
+        register __vector double va0_1 = *(__vector double*) (&a0[i + 2]);
+        register __vector double va0_2 = *(__vector double*) (&a0[i + 4]);
+        register __vector double va0_3 = *(__vector double*) (&a0[i + 6]);
+
+        register __vector double va1 = *(__vector double*) (&a1[i]);
+        register __vector double va1_1 = *(__vector double*) (&a1[i + 2]);
+        register __vector double va1_2 = *(__vector double*) (&a1[i + 4]);
+        register __vector double va1_3 = *(__vector double*) (&a1[i + 6]);
+
+        /* lane-swapped copies of x: (im, re) */
+        register __vector double vxr_0 = vec_xxpermdi(vx_0, vx_0, 2);
+        register __vector double vxr_1 = vec_xxpermdi(vx_1, vx_1, 2);
+
+        i += 8;
+
+        vtemp0_p += vx_0*va0;
+        vtemp0_r += vxr_0*va0;
+
+        vtemp1_p += vx_0*va1;
+        vtemp1_r += vxr_0*va1;
+
+        vxr_0 = vec_xxpermdi(vx_2, vx_2, 2);
+        vtemp0_p += vx_1*va0_1;
+        vtemp0_r += vxr_1*va0_1;
+
+        vtemp1_p += vx_1*va1_1;
+        vtemp1_r += vxr_1*va1_1;
+        vxr_1 = vec_xxpermdi(vx_3, vx_3, 2);
+
+        vtemp0_p += vx_2*va0_2;
+        vtemp0_r += vxr_0*va0_2;
+
+        vtemp1_p += vx_2*va1_2;
+        vtemp1_r += vxr_0*va1_2;
+
+        vtemp0_p += vx_3*va0_3;
+        vtemp0_r += vxr_1*va0_3;
+
+        vtemp1_p += vx_3*va1_3;
+        vtemp1_r += vxr_1*va1_3;
+
+    }
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+    register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1];
+    register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1];
+
+    register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1];
+    register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1];
+
+#else
+    register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1];
+    register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1];
+
+    register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1];
+    register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1];
+
+#endif
+
+#if !defined(XCONJ)
+
+    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+    y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+
+#else
+
+    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+    y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+
+#endif
+}
+
+#else
+
+/*
+ * zgemv_kernel_4x2 (portable variant): same contract as the VSX variant,
+ * one complex element per iteration and no restriction on n.
+ */
+static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+    FLOAT *a0, *a1;
+    a0 = ap;
+    a1 = ap + lda;
+
+    FLOAT temp_r0 = 0.0;
+    FLOAT temp_r1 = 0.0;
+    FLOAT temp_i0 = 0.0;
+    FLOAT temp_i1 = 0.0;
+
+    for (i = 0; i < 2 * n; i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1];
+        temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i];
+        temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1];
+        temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i];
+#else
+        temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1];
+        temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i];
+        temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1];
+        temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i];
+#endif
+    }
+
+#if !defined(XCONJ)
+
+    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+    y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+
+#else
+
+    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+    y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+
+#endif
+}
+
+#endif
+
+#ifdef HAVE_KERNEL_4x1_VEC
+
+/*
+ * zgemv_kernel_4x1 (VSX variant): one complex dot product of the column at
+ * ap with x, scaled by alpha and added to y[0], y[1]. Same conventions as
+ * zgemv_kernel_4x4; n must be a multiple of 4.
+ */
+static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+    FLOAT *a0 ;
+    a0 = ap;
+    /* vtemp0_p accumulates the straight products (re*re, im*im);
+     * vtemp0_r accumulates the cross products (re*im, im*re). */
+    register __vector double vtemp0_p = {0.0, 0.0};
+    register __vector double vtemp0_r = {0.0, 0.0};
+    i = 0;
+    n = n << 1; /* from complex count to FLOAT count */
+    while (i < n) {
+
+        register __vector double vx_0 = *(__vector double*) (&x[i]);
+        register __vector double vx_1 = *(__vector double*) (&x[i + 2]);
+        register __vector double vx_2 = *(__vector double*) (&x[i + 4]);
+        register __vector double vx_3 = *(__vector double*) (&x[i + 6]);
+
+        register __vector double va0 = *(__vector double*) (&a0[i]);
+        register __vector double va0_1 = *(__vector double*) (&a0[i + 2]);
+        register __vector double va0_2 = *(__vector double*) (&a0[i + 4]);
+        register __vector double va0_3 = *(__vector double*) (&a0[i + 6]);
+
+        /* lane-swapped copies of x: (im, re) */
+        register __vector double vxr_0 = vec_xxpermdi(vx_0, vx_0, 2);
+        register __vector double vxr_1 = vec_xxpermdi(vx_1, vx_1, 2);
+
+        i += 8;
+
+        vtemp0_p += vx_0*va0;
+        vtemp0_r += vxr_0*va0;
+
+        vxr_0 = vec_xxpermdi(vx_2, vx_2, 2);
+        vtemp0_p += vx_1*va0_1;
+        vtemp0_r += vxr_1*va0_1;
+
+        vxr_1 = vec_xxpermdi(vx_3, vx_3, 2);
+
+        vtemp0_p += vx_2*va0_2;
+        vtemp0_r += vxr_0*va0_2;
+
+        vtemp0_p += vx_3*va0_3;
+        vtemp0_r += vxr_1*va0_3;
+
+    }
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+    register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1];
+    register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1];
+
+#else
+    register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1];
+    register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1];
+
+#endif
+
+#if !defined(XCONJ)
+
+    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+
+#else
+
+    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+#endif
+
+}
+
+#else
+
+/*
+ * zgemv_kernel_4x1 (portable variant): same contract as the VSX variant,
+ * one complex element per iteration and no restriction on n.
+ */
+static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+    FLOAT *a0;
+    a0 = ap;
+
+    FLOAT temp_r0 = 0.0;
+    FLOAT temp_i0 = 0.0;
+
+    for (i = 0; i < 2 * n; i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1];
+        temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i];
+#else
+        temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1];
+        temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i];
+#endif
+    }
+
+#if !defined(XCONJ)
+
+    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+
+#else
+
+    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+
+#endif
+
+}
+
+#endif
+
+/*
+ * copy_x: gather n complex values from src, read with a stride of inc_src
+ * FLOATs, into the contiguous buffer dest (two FLOATs per value).
+ * `inline` accompanies always_inline so the compiler does not warn that
+ * the function might not be inlinable.
+ */
+static inline __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
+    BLASLONG i;
+    for (i = 0; i < n; i++) {
+        *dest = *src;
+        *(dest + 1) = *(src + 1);
+        dest += 2;
+        src += inc_src;
+    }
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
+    BLASLONG i;
+    BLASLONG j;
+    FLOAT *a_ptr;
+    FLOAT *x_ptr;
+    FLOAT *y_ptr;
+
+    BLASLONG n1;
+    BLASLONG m1;
+    BLASLONG m2;
+    BLASLONG m3;
+    BLASLONG n2;
+
+    FLOAT ybuffer[8], *xbuffer;
+
+    if (m < 1) return (0);
+    if (n < 1) return (0);
+
+    inc_x <<= 1;
+    inc_y <<= 1;
+    lda <<= 1;
+
+    xbuffer = buffer;
+
+    n1 = n >> 2;
+    n2 = n & 3;
+
+    m3 = m & 3;
+    m1 = m - m3;
+    m2 = (m & (NBMAX - 1)) - m3;
+
+    BLASLONG NB = NBMAX;
+
+    while (NB == NBMAX) {
+
+        m1 -= NB;
+        if (m1 < 0) {
+            if (m2 == 0) break;
+            NB = m2;
+        }
+
+        y_ptr = y;
+        a_ptr = a;
+        x_ptr = x;
+
+        if (inc_x != 2)
+            copy_x(NB, x_ptr, xbuffer, inc_x);
+        else
+            xbuffer = x_ptr;
+
+        if (inc_y == 2) {
+
+            for (i = 0; i < n1; i++) {
+                zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
+                a_ptr += lda << 2;
+                y_ptr += 8;
+
+            }
+
+            if (n2 &
2) { + zgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; + y_ptr += 4; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda; + y_ptr += 2; + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += 
alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += 
a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return (0); + } + + if (m3 == 1) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * 
x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + return (0); + +} + diff --git a/kernel/power/zrot.c b/kernel/power/zrot.c new file mode 100644 index 000000000..d45468fd5 --- /dev/null +++ b/kernel/power/zrot.c @@ -0,0 +1,265 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +{ + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + __vector double t4; + __vector double t5; + __vector double t6; + __vector double t7; + + __asm__ + ( + "xxspltd 36, %x[cos], 0 \n\t" // load c to both dwords + "xxspltd 37, %x[sin], 0 \n\t" // load s to both dwords + + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + + "addic. 
%[temp_n], %[temp_n], -4 \n\t" + "ble 2f \n\t" + + ".p2align 5 \n" + "1: \n\t" + + "xvmuldp 40, 32, 36 \n\t" // c * x + "xvmuldp 41, 33, 36 \n\t" + "xvmuldp 42, 34, 36 \n\t" + "xvmuldp 43, 35, 36 \n\t" + + "xvmuldp %x[x0], 48, 36 \n\t" // c * y + "xvmuldp %x[x1], 49, 36 \n\t" + "xvmuldp %x[x2], 50, 36 \n\t" + "xvmuldp %x[x3], 51, 36 \n\t" + + "xvmuldp 44, 32, 37 \n\t" // s * x + "xvmuldp 45, 33, 37 \n\t" + + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16],%[x_ptr] \n\t" + + "xvmuldp 46, 34, 37 \n\t" + "xvmuldp 47, 35, 37 \n\t" + + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + + "xvmuldp %x[x4], 48, 37 \n\t" // s * y + "xvmuldp %x[x5], 49, 37 \n\t" + + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + + "xvmuldp %x[x6], 50, 37 \n\t" + "xvmuldp %x[x7], 51, 37 \n\t" + + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + + "xvadddp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvadddp 41, 41, %x[x5] \n\t" // c * x + s * y + + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + + "xvadddp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvadddp 43, 43, %x[x7] \n\t" // c * x + s * y + + "xvsubdp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubdp %x[x1], %x[x1], 45 \n\t" // c * y - s * x + "xvsubdp %x[x2], %x[x2], 46 \n\t" // c * y - s * x + "xvsubdp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x1], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x2], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + + "addic. 
%[temp_n], %[temp_n], -4 \n\t" + "bgt+ 1b \n" + + "2: \n\t" + + "xvmuldp 40, 32, 36 \n\t" // c * x + "xvmuldp 41, 33, 36 \n\t" + "xvmuldp 42, 34, 36 \n\t" + "xvmuldp 43, 35, 36 \n\t" + + "xvmuldp %x[x0], 48, 36 \n\t" // c * y + "xvmuldp %x[x1], 49, 36 \n\t" + "xvmuldp %x[x2], 50, 36 \n\t" + "xvmuldp %x[x3], 51, 36 \n\t" + + "xvmuldp 44, 32, 37 \n\t" // s * x + "xvmuldp 45, 33, 37 \n\t" + "xvmuldp 46, 34, 37 \n\t" + "xvmuldp 47, 35, 37 \n\t" + + "xvmuldp %x[x4], 48, 37 \n\t" // s * y + "xvmuldp %x[x5], 49, 37 \n\t" + "xvmuldp %x[x6], 50, 37 \n\t" + "xvmuldp %x[x7], 51, 37 \n\t" + + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + + "xvadddp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvadddp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvadddp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvadddp 43, 43, %x[x7] \n\t" // c * x + s * y + + "xvsubdp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubdp %x[x1], %x[x1], 45 \n\t" // c * y - s * x + "xvsubdp %x[x2], %x[x2], 46 \n\t" // c * y - s * x + "xvsubdp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x1], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x2], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + + + : + [mem_x] "+m" (*(double (*)[2*n])x), + [mem_y] "+m" (*(double (*)[2*n])y), + [temp_n] "+&r" (n), + [x_ptr] "+&b"(x), [y_ptr] "+&b"(y), + [x0] "=wa" (t0), + [x1] "=wa" (t1), + [x2] "=wa" (t2), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "d" (cosA), + [sin] "d" (sinA), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); + + return; + +} + +int CNAME(BLASLONG n, 
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -4; + if ( n1 > 0 ) + { + zrot_kernel_4(n1, x, y, c, s); + i=n1; + ix=2*n1; + } + + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + +} + + \ No newline at end of file diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 6cec47458..212de25c8 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -28,126 +28,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" + static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { - __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" - "pfd 2, 0(%[y_tmp]) \n\t" - "lgdr %%r1,%[alpha_r] \n\t" - "vlvgp %%v28,%%r1,%%r1 \n\t" - "lgdr %%r1,%[alpha_i] \n\t" - "vlvgp %%v29,%%r1,%%r1 \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" + BLASLONG tempR1 ; + __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" + "pfd 2, 0(%[y_tmp]) \n\t" +#if !defined(CONJ) + "lgdr %[t1],%[alpha_r] \n\t" + "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint + "lgdr %[t1],%[alpha_i] \n\t" + "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint + "vflcdb %%v29,%%v29 \n\t" //complement both + "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} + +#else + "lgdr %[t1],%[alpha_i] \n\t" + "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint + "lgdr %[t1],%[alpha_r] \n\t" + "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint + "vflcdb %%v28,%%v28 \n\t" //complement both + "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} +#endif + + "xgr %[t1],%[t1] \n\t" + "sllg %[tmp],%[tmp],4 \n\t" + "vl %%v30 , 0(%[t1],%[y_tmp]) \n\t" + "vl %%v31 , 16(%[t1],%[y_tmp]) \n\t" + "vl %%v6 , 32(%[t1],%[y_tmp]) \n\t" + "vl %%v7 , 48(%[t1],%[y_tmp]) \n\t" + "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" + "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" + "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" + "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" + "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition + "j 2f \n\t" ".align 16 \n\t" "1: \n\t" - "pfd 1, 256(%%r1,%[x_tmp]) \n\t" - "pfd 2, 256(%%r1,%[y_tmp]) \n\t" - "vleg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t" - "vleg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t" - "vleg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t" - "vleg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t" - "vleg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t" - "vleg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t" - "vleg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t" - "vleg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t" - "vleg %%v24 , 0(%%r1,%[x_tmp]),0 \n\t" - 
"vleg %%v25 , 8(%%r1,%[x_tmp]),0 \n\t" - "vleg %%v24 , 16(%%r1,%[x_tmp]),1 \n\t" - "vleg %%v25 , 24(%%r1,%[x_tmp]),1 \n\t" - "vleg %%v26 , 32(%%r1,%[x_tmp]),0 \n\t" - "vleg %%v27 , 40(%%r1,%[x_tmp]),0 \n\t" - "vleg %%v26 , 48(%%r1,%[x_tmp]),1 \n\t" - "vleg %%v27 , 56(%%r1,%[x_tmp]),1 \n\t" -#if !defined(CONJ) - "vfmsdb %%v16, %%v25, %%v29,%%v16 \n\t" - "vfmadb %%v17, %%v24, %%v29, %%v17 \n\t" - "vfmsdb %%v18, %%v27, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v26, %%v29, %%v19 \n\t" + + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" + "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" + "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" + "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" + "vl %%v30, 64(%[t1],%[y_tmp]) \n\t" + "vl %%v31, 80(%[t1],%[y_tmp]) \n\t" + "vl %%v6 , 96(%[t1],%[y_tmp]) \n\t" + "vl %%v7 , 112(%[t1],%[y_tmp]) \n\t" + "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" + "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" + "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" + "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" + "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" + "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" + "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" + "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - "vfmsdb %%v16, %%v24, %%v28 ,%%v16 \n\t" - "vfmadb %%v17, %%v25, %%v28, %%v17 \n\t" - "vfmsdb %%v18, %%v26, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v28, %%v19 \n\t" -#else - "vfmadb %%v16, %%v25, %%v29, %%v16 \n\t" - "vfmsdb %%v17, %%v25, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v27, %%v29, %%v18 \n\t" - "vfmsdb %%v19, %%v27, %%v28, %%v19 \n\t" - "vfmadb %%v16, %%v24, %%v28, %%v16 \n\t" - "vfmsdb %%v17, %%v24, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v28, %%v18 \n\t" - "vfmsdb %%v19, %%v26, %%v29, %%v19 \n\t" + "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" + "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" + "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" + "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" + + "la %[t1],64(%[t1] ) 
\n\t" + "2: \n\t" + "pfd 1, 256(%[t1],%[x_tmp]) \n\t" + "pfd 2, 256(%[t1],%[y_tmp]) \n\t" + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" -#endif - "vsteg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t" - "vsteg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t" - "vsteg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t" - "vsteg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t" + "vfmadb %%v30, %%v20, %%v28, %%v30 \n\t" + "vfmadb %%v31, %%v21, %%v28, %%v31 \n\t" + "vfmadb %%v6, %%v22, %%v28, %%v6 \n\t" + "vfmadb %%v7, %%v23, %%v28, %%v7 \n\t" + "vl %%v16, 64(%[t1],%[y_tmp]) \n\t" + "vl %%v17, 80(%[t1],%[y_tmp]) \n\t" + "vl %%v18, 96(%[t1],%[y_tmp]) \n\t" + "vl %%v19, 112(%[t1],%[y_tmp]) \n\t" + "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" + "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" + "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" + "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" - "vsteg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t" - "vsteg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t" - "vsteg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t" - "vsteg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t" + "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" + "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" + "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" + "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - "vleg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t" - "vleg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t" - "vleg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t" - "vleg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t" + "vst %%v30 , 0(%[t1],%[y_tmp]) \n\t" + "vst %%v31 , 16(%[t1],%[y_tmp]) \n\t" + "vst %%v6 , 32(%[t1],%[y_tmp]) \n\t" + "vst %%v7 , 48(%[t1],%[y_tmp]) \n\t" + + "la %[t1],64(%[t1] ) \n\t" + - "vleg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t" - "vleg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t" - "vleg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t" - "vleg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t" + "clgrjl %[t1],%[tmp],1b \n\t" +//---------------------------------------------------------------------- + "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" + "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" + "vfmadb %%v18, %%v22, %%v28, %%v18 
\n\t" + "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" + "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" + "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" + "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - "vleg %%v24 , 64(%%r1,%[x_tmp]),0 \n\t" - "vleg %%v25 , 72(%%r1,%[x_tmp]),0 \n\t" - "vleg %%v24 , 80(%%r1,%[x_tmp]),1 \n\t" - "vleg %%v25 , 88(%%r1,%[x_tmp]),1 \n\t" + "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" + "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" + "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" + "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" - "vleg %%v26 , 96(%%r1,%[x_tmp]),0 \n\t" - "vleg %%v27 , 104(%%r1,%[x_tmp]),0 \n\t" - "vleg %%v26 , 112(%%r1,%[x_tmp]),1 \n\t" - "vleg %%v27 , 120(%%r1,%[x_tmp]),1 \n\t" -#if !defined(CONJ) - "vfmsdb %%v20, %%v25, %%v29,%%v20 \n\t" - "vfmadb %%v21, %%v24, %%v29, %%v21 \n\t" - "vfmsdb %%v22, %%v27, %%v29, %%v22 \n\t" - "vfmadb %%v23, %%v26, %%v29, %%v23 \n\t" - - "vfmsdb %%v20, %%v24, %%v28 ,%%v20 \n\t" - "vfmadb %%v21, %%v25, %%v28, %%v21 \n\t" - "vfmsdb %%v22, %%v26, %%v28, %%v22 \n\t" - "vfmadb %%v23, %%v27, %%v28, %%v23 \n\t" -#else - "vfmadb %%v20, %%v25, %%v29, %%v20 \n\t" - "vfmsdb %%v21, %%v25, %%v28, %%v21 \n\t" - "vfmadb %%v22, %%v27, %%v29, %%v22 \n\t" - "vfmsdb %%v23, %%v27, %%v28, %%v23 \n\t" - "vfmadb %%v20, %%v24, %%v28, %%v20 \n\t" - "vfmsdb %%v21, %%v24, %%v29, %%v21 \n\t" - "vfmadb %%v22, %%v26, %%v28, %%v22 \n\t" - "vfmsdb %%v23, %%v26, %%v29, %%v23 \n\t" -#endif - "vsteg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t" - "vsteg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t" - "vsteg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t" - "vsteg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t" - - "vsteg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t" - "vsteg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t" - "vsteg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t" - "vsteg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t" - - "la %%r1,128(%%r1) \n\t" - "clgrjl %%r1,%[tmp],1b 
\n\t" - : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) + : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "r1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29" + : "cc", "v6","v7", "v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); } + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index e2805749b..484db3073 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,439 +23,503 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ + *****************************************************************************/ #include #include #include "common.h" - #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 #define HAVE_KERNEL_ADDY 1 #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) - #include +#include #endif -/** - * if define IGNORE_TEMP_PERM we store and use ybuffer as {real,real} {img;img} - * of not we will retrieve and store normal way - */ -#if (defined(HAVE_KERNEL_4x4_VEC_ASM) || defined(HAVE_KERNEL_4x4_VEC) ) && defined(HAVE_KERNEL_4x2_VEC) && defined(HAVE_KERNEL_4x1_VEC) && defined(HAVE_KERNEL_ADDY) - // #define IGNORE_TEMP_PERM 1 -#endif - +// #define NBMAX 1024 #ifdef HAVE_KERNEL_4x4_VEC_ASM #elif HAVE_KERNEL_4x4_VEC -static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - - register __vector double vx0_r = {x[0],x[0]}; - register __vector double vx0_i = {x[1],x[1]}; - register __vector double vx1_r = {x[2],x[2]}; - register __vector double vx1_i = {x[3],x[3]}; - register __vector double vx2_r = {x[4],x[4]}; - register __vector double vx2_i = {x[5],x[5]}; - register __vector double vx3_r = {x[6],x[6]}; - register __vector double vx3_i = {x[7],x[7]}; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; -#ifdef IGNORE_TEMP_PERM - register __vector double *vy = (__vector double *)y; - register BLASLONG j=0; -#endif +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - for ( i=0; i< 2*n; i+=4 ) - { - -#ifdef IGNORE_TEMP_PERM - register __vector double vresult_r = vy[j]; - register __vector double vresult_i = vy[j+1]; + register __vector double vx0_r = {x[0], 
x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + register __vector double vx2_r = {x[4], x[4]}; + register __vector double vx2_i = {-x[5], x[5]}; + register __vector double vx3_r = {x[6], x[6]}; + register __vector double vx3_i = {-x[7], x[7]}; -#else - register __vector double vresult_r = {y[i],y[i+2]}; - register __vector double vresult_i = {y[i+1],y[i+3]}; -#endif - register __vector double va0_r= {a0[i],a0[i+2]}; - register __vector double va0_i= {a0[i+1],a0[i+3]}; - register __vector double va1_r= {a1[i],a1[i+2]}; - register __vector double va1_i= {a1[i+1],a1[i+3]}; - register __vector double va2_r= {a2[i],a2[i+2]}; - register __vector double va2_i= {a2[i+1],a2[i+3]}; - register __vector double va3_r= {a3[i],a3[i+2]}; - register __vector double va3_i= {a3[i+1],a3[i+3]}; -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - vresult_r = va0_r * vx0_r - (va0_i*vx0_i -vresult_r) ; - vresult_i = vresult_i + va0_r * vx0_i + va0_i * vx0_r ; - vresult_r = va1_r * vx1_r - (va1_i*vx1_i -vresult_r) ; - vresult_i = vresult_i + va1_r * vx1_i + va1_i * vx1_r ; - vresult_r = va2_r * vx2_r - (va2_i*vx2_i -vresult_r) ; - vresult_i = vresult_i + va2_r * vx2_i + va2_i * vx2_r ; - vresult_r = va3_r * vx3_r - (va3_i*vx3_i -vresult_r) ; - vresult_i = vresult_i + va3_r * vx3_i + va3_i * vx3_r ; - -#else - vresult_r = vresult_r + va0_r * vx0_r + va0_i*vx0_i ; - vresult_i = va0_r * vx0_i - ( va0_i * vx0_r - vresult_i) ; - vresult_r = vresult_r + va1_r * vx1_r + va1_i*vx1_i ; - vresult_i = va1_r * vx1_i - ( va1_i * vx1_r - vresult_i) ; - vresult_r = vresult_r + va2_r * vx2_r + va2_i*vx2_i ; - vresult_i = va2_r * vx2_i - ( va2_i * vx2_r - vresult_i) ; - vresult_r = vresult_r + va3_r * vx3_r + va3_i*vx3_i ; - vresult_i = va3_r * vx3_i - ( va3_i * vx3_r - vresult_i) ; +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double 
vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; + register __vector double vx2_r = {x[4], -x[4]}; + register __vector double vx2_i = {x[5], x[5]}; + register __vector double vx3_r = {x[6], -x[6]}; + register __vector double vx3_i = {x[7], x[7]}; #endif -#ifdef IGNORE_TEMP_PERM - vy[j] = vresult_r ; - vy[j+1] = vresult_i ; - j+=2; + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + register __vector double *vptr_a2 = (__vector double *) a2; + register __vector double *vptr_a3 = (__vector double *) a3; -#else - y[i] = vresult_r[0]; - y[i+1] = vresult_i[0]; - y[i +2 ] = vresult_r[1]; - y[i + 3 ] = vresult_i[1]; -#endif - - } - + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va1 = vptr_a1[i]; + register __vector double va1_1 = vptr_a1[i + 1]; + register __vector double va1_2 = vptr_a1[i + 2]; + register __vector double va1_3 = vptr_a1[i + 3]; + + register __vector double va2 = vptr_a2[i]; + register __vector double va2_1 = vptr_a2[i + 1]; + register __vector double va2_2 = vptr_a2[i + 2]; + register __vector double va2_3 = vptr_a2[i + 3]; + + register __vector double va3 = vptr_a3[i]; + register __vector double va3_1 = vptr_a3[i + 1]; + register __vector double va3_2 = vptr_a3[i + 2]; + register __vector double va3_3 = vptr_a3[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += 
va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va0 = vec_permi(va0, va0, 2); + va0_1 = vec_permi(va0_1, va0_1, 2); + va0_2 = vec_permi(va0_2, va0_2, 2); + va0_3 = vec_permi(va0_3, va0_3, 2); + + vy_0 += va2*vx2_r; + vy_1 += va2_1*vx2_r; + vy_2 += va2_2*vx2_r; + vy_3 += va2_3*vx2_r; + + va1 = vec_permi(va1, va1, 2); + va1_1 = vec_permi(va1_1, va1_1, 2); + va1_2 = vec_permi(va1_2, va1_2, 2); + va1_3 = vec_permi(va1_3, va1_3, 2); + + vy_0 += va3*vx3_r; + vy_1 += va3_1*vx3_r; + vy_2 += va3_2*vx3_r; + vy_3 += va3_3*vx3_r; + + va2 = vec_permi(va2, va2, 2); + va2_1 = vec_permi(va2_1, va2_1, 2); + va2_2 = vec_permi(va2_2, va2_2, 2); + va2_3 = vec_permi(va2_3, va2_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + va3 = vec_permi(va3, va3, 2); + va3_1 = vec_permi(va3_1, va3_1, 2); + va3_2 = vec_permi(va3_2, va3_2, 2); + va3_3 = vec_permi(va3_3, va3_3, 2); + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy_0 += va2*vx2_i; + vy_1 += va2_1*vx2_i; + vy_2 += va2_2*vx2_i; + vy_3 += va2_3*vx2_i; + + vy_0 += va3*vx3_i; + vy_1 += va3_1*vx3_i; + vy_2 += va3_2*vx3_i; + vy_3 += va3_3*vx3_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } } - #else -static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - for ( i=0; i< 2*n; i+=2 ) - { +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + + for (i = 0; i < 2 * n; i += 2) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; - y[i] += a1[i]*x[2] - a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; - y[i] += a2[i]*x[4] - a2[i+1] * x[5]; - y[i+1] += 
a2[i]*x[5] + a2[i+1] * x[4]; - y[i] += a3[i]*x[6] - a3[i+1] * x[7]; - y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; #else - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; - y[i] += a1[i]*x[2] + a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; - y[i] += a2[i]*x[4] + a2[i+1] * x[5]; - y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; - y[i] += a3[i]*x[6] + a3[i+1] * x[7]; - y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; #endif } } - + #endif - - #ifdef HAVE_KERNEL_4x2_VEC -static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; - FLOAT *a0,*a1; - a0 = ap[0]; - a1 = ap[1]; - - register __vector double vx0_r = {x[0],x[0]}; - register __vector double vx0_i = {x[1],x[1]}; - register __vector double vx1_r = {x[2],x[2]}; - register __vector double vx1_i = {x[3],x[3]}; -#ifdef IGNORE_TEMP_PERM - register __vector double *vy = (__vector double *)y; - register BLASLONG j=0; -#endif + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; - for ( i=0; i< 2*n; i+=4 ) - { -#ifdef IGNORE_TEMP_PERM - register __vector double vresult_r = vy[j]; - register __vector double vresult_i = vy[j+1]; -#else - register __vector double vresult_r = {y[i],y[i+2]}; - 
register __vector double vresult_i = {y[i+1],y[i+3]}; -#endif - register __vector double va0_r= {a0[i],a0[i+2]}; - register __vector double va0_i= {a0[i+1],a0[i+3]}; - register __vector double va1_r= {a1[i],a1[i+2]}; - register __vector double va1_i= {a1[i+1],a1[i+3]}; -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - vresult_r = va0_r * vx0_r - (va0_i*vx0_i -vresult_r) ; - vresult_i = vresult_i + va0_r * vx0_i + va0_i * vx0_r ; - vresult_r = va1_r * vx1_r - (va1_i*vx1_i -vresult_r) ; - vresult_i = vresult_i + va1_r * vx1_i + va1_i * vx1_r ; - -#else - vresult_r = vresult_r + va0_r * vx0_r + va0_i*vx0_i ; - vresult_i = va0_r * vx0_i - ( va0_i * vx0_r - vresult_i) ; - vresult_r = vresult_r + va1_r * vx1_r + va1_i*vx1_i ; - vresult_i = va1_r * vx1_i - ( va1_i * vx1_r - vresult_i) ; -#endif - -#ifdef IGNORE_TEMP_PERM - vy[j] = vresult_r ; - vy[j+1] = vresult_i ; - j+=2; - -#else - y[i] = vresult_r[0]; - y[i+1] = vresult_i[0]; - y[i +2 ] = vresult_r[1]; - y[i + 3 ] = vresult_i[1]; -#endif - - } -} - -#else -static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1; - a0 = ap[0]; - a1 = ap[1]; - - for ( i=0; i< 2*n; i+=2 ) - { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; - y[i] += a1[i]*x[2] - a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) 
a0; + register __vector double *vptr_a1 = (__vector double *) a1; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va1 = vptr_a1[i]; + register __vector double va1_1 = vptr_a1[i + 1]; + register __vector double va1_2 = vptr_a1[i + 2]; + register __vector double va1_3 = vptr_a1[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + va0 = vec_permi(va0, va0, 2); + va0_1 = vec_permi(va0_1, va0_1, 2); + va0_2 = vec_permi(va0_2, va0_2, 2); + va0_3 = vec_permi(va0_3, va0_3, 2); + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va1 = vec_permi(va1, va1, 2); + va1_1 = vec_permi(va1_1, va1_1, 2); + va1_2 = vec_permi(va1_2, va1_2, 2); + va1_3 = vec_permi(va1_3, va1_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} +#else + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; #else - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; - y[i] += a1[i]*x[2] + 
a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; #endif } } - + #endif - - - #ifdef HAVE_KERNEL_4x1_VEC -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0; a0 = ap; - - register __vector double vx_r = {x[0],x[0]}; - register __vector double vx_i = {x[1],x[1]}; -#ifdef IGNORE_TEMP_PERM - register __vector double *vy = (__vector double *)y; - register BLASLONG j=0; -#endif - for ( i=0; i< 2*n; i+=4 ) - { -#ifdef IGNORE_TEMP_PERM - register __vector double vresult_r = vy[j]; - register __vector double vresult_i = vy[j+1]; - -#else - register __vector double vresult_r = {y[i],y[i+2]}; - register __vector double vresult_i = {y[i+1],y[i+3]}; -#endif - register __vector double va0_r= {a0[i],a0[i+2]}; - register __vector double va0_i= {a0[i+1],a0[i+3]}; #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - vresult_r = va0_r * vx_r - (va0_i*vx_i -vresult_r) ; - vresult_i = vresult_i + va0_r * vx_i + va0_i * vx_r ; - -#else - vresult_r = vresult_r + va0_r * vx_r + va0_i*vx_i ; - vresult_i = va0_r * vx_i - ( va0_i * vx_r - vresult_i) ; - -// y[i] += a0[i]*x[0] + a0[i+1] * x[1]; -// y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; -#endif -#ifndef IGNORE_TEMP_PERM - y[i] = vresult_r[0]; - y[i+1] = vresult_i[0]; - y[i +2 ] = vresult_r[1]; - y[i + 3 ] = vresult_i[1]; + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + #else - vy[j] = vresult_r ; - vy[j+1] = vresult_i ; - j+=2; -#endif + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) 
a0; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + va0 = vec_permi(va0, va0, 2); + va0_1 = vec_permi(va0_1, va0_1, 2); + va0_2 = vec_permi(va0_2, va0_2, 2); + va0_3 = vec_permi(va0_3, va0_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; } } #else -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0; a0 = ap; - for ( i=0; i< 2*n; i+=2 ) - { + for (i = 0; i < 2 * n; i += 2) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; #else - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; #endif } } - #endif - #ifdef HAVE_KERNEL_ADDY -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) -{ + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; -#ifdef IGNORE_TEMP_PERM - register __vector double *src_vec = (__vector double *)src; -#endif - register __vector double valpha_r = {alpha_r,alpha_r}; - register __vector double valpha_i = {alpha_i,alpha_i}; - 
register __vector double vresult_r; - register __vector double vresult_i; - if ( inc_dest != 2 ) - { - - - for ( i=0; i +#include #endif #ifdef HAVE_KERNEL_4x4_VEC_ASM #elif HAVE_KERNEL_4x4_VEC -static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - register __vector double vtemp0_r = {0.0,0.0}; - register __vector double vtemp0_i = {0.0,0.0}; - register __vector double vtemp1_r = {0.0,0.0}; - register __vector double vtemp1_i = {0.0,0.0}; - register __vector double vtemp2_r = {0.0,0.0}; - register __vector double vtemp2_i = {0.0,0.0}; - register __vector double vtemp3_r = {0.0,0.0}; - register __vector double vtemp3_i = {0.0,0.0}; - for ( i=0; i< 2*n; i+=4 ) - { - register __vector double vx_r = {x[i],x[i+2]}; - register __vector double vx_i = {x[i+1],x[i+3]}; - register __vector double va0_r= {a0[i],a0[i+2]}; - register __vector double va0_i= {a0[i+1],a0[i+3]}; - register __vector double va1_r= {a1[i],a1[i+2]}; - register __vector double va1_i= {a1[i+1],a1[i+3]}; - register __vector double va2_r= {a2[i],a2[i+2]}; - register __vector double va2_i= {a2[i+1],a2[i+3]}; - register __vector double va3_r= {a3[i],a3[i+2]}; - register __vector double va3_i= {a3[i+1],a3[i+3]}; - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + //p for positive(real*real,image*image) r for image (real*image,image*real) + register __vector double vtemp0_p = {0.0, 0.0}; + register __vector double vtemp0_r = {0.0, 0.0}; + register __vector double vtemp1_p = {0.0, 0.0}; + register __vector double vtemp1_r = {0.0, 0.0}; + register __vector double vtemp2_p = {0.0, 0.0}; + register __vector double vtemp2_r = {0.0, 0.0}; + 
register __vector double vtemp3_p = {0.0, 0.0}; + register __vector double vtemp3_r = {0.0, 0.0}; + i = 0; + n = n << 1; + while (i < n) { +// __builtin_prefetch(&x[i]); +// __builtin_prefetch(&a0[i]); +// __builtin_prefetch(&a1[i]); +// __builtin_prefetch(&a2[i]); +// __builtin_prefetch(&a3[i]); + register __vector double vx_0 = *(__vector double*) (&x[i]); + register __vector double vx_1 = *(__vector double*) (&x[i + 2]); + register __vector double vx_2 = *(__vector double*) (&x[i + 4]); + register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - vtemp0_r = va0_r * vx_r - (va0_i*vx_i -vtemp0_r) ; - vtemp0_i = vtemp0_i + va0_r * vx_i + va0_i * vx_r ; - vtemp1_r = va1_r * vx_r - (va1_i*vx_i -vtemp1_r) ; - vtemp1_i = vtemp1_i + va1_r * vx_i + va1_i * vx_r ; + register __vector double va0 = *(__vector double*) (&a0[i]); + register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); + register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); + register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - vtemp2_r = va2_r * vx_r - (va2_i*vx_i -vtemp2_r) ; - vtemp2_i = vtemp2_i + va2_r * vx_i + va2_i * vx_r ; - vtemp3_r = va3_r * vx_r - (va3_i*vx_i -vtemp3_r) ; - vtemp3_i = vtemp3_i + va3_r * vx_i + va3_i * vx_r ; -#else + register __vector double va1 = *(__vector double*) (&a1[i]); + register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); + register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); + register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); + register __vector double va2 = *(__vector double*) (&a2[i]); + register __vector double va2_1 = *(__vector double*) (&a2[i + 2]); + register __vector double va2_2 = *(__vector double*) (&a2[i + 4]); + register __vector double va2_3 = *(__vector double*) (&a2[i + 6]); + + register __vector double va3 = *(__vector double*) (&a3[i]); + register __vector double va3_1 = *(__vector double*) (&a3[i + 2]); + register __vector double va3_2 = *(__vector double*) 
(&a3[i + 4]); + register __vector double va3_3 = *(__vector double*) (&a3[i + 6]); + + register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); + register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); + + i += 8; + + vtemp0_p += vx_0*va0; + vtemp0_r += vxr_0*va0; + + vtemp1_p += vx_0*va1; + vtemp1_r += vxr_0*va1; + + vtemp2_p += vx_0*va2; + vtemp2_r += vxr_0*va2; + + vtemp3_p += vx_0*va3; + vtemp3_r += vxr_0*va3; + + vtemp0_p += vx_1*va0_1; + vtemp0_r += vxr_1*va0_1; + + vtemp1_p += vx_1*va1_1; + vtemp1_r += vxr_1*va1_1; + vxr_0 = vec_permi(vx_2, vx_2, 2); + vtemp2_p += vx_1*va2_1; + vtemp2_r += vxr_1*va2_1; + + vtemp3_p += vx_1*va3_1; + vtemp3_r += vxr_1*va3_1; + + vtemp0_p += vx_2*va0_2; + vtemp0_r += vxr_0*va0_2; + vxr_1 = vec_permi(vx_3, vx_3, 2); + + vtemp1_p += vx_2*va1_2; + vtemp1_r += vxr_0*va1_2; + + vtemp2_p += vx_2*va2_2; + vtemp2_r += vxr_0*va2_2; + + vtemp3_p += vx_2*va3_2; + vtemp3_r += vxr_0*va3_2; + + vtemp0_p += vx_3*va0_3; + vtemp0_r += vxr_1*va0_3; + + vtemp1_p += vx_3*va1_3; + vtemp1_r += vxr_1*va1_3; + + vtemp2_p += vx_3*va2_3; + vtemp2_r += vxr_1*va2_3; + + vtemp3_p += vx_3*va3_3; + vtemp3_r += vxr_1*va3_3; - vtemp0_r = vtemp0_r + va0_r * vx_r + va0_i*vx_i ; - vtemp0_i = va0_r * vx_i - ( va0_i * vx_r - vtemp0_i) ; - vtemp1_r = vtemp1_r + va1_r * vx_r + va1_i*vx_i ; - vtemp1_i = va1_r * vx_i - ( va1_i * vx_r - vtemp1_i); - vtemp2_r = vtemp2_r + va2_r * vx_r + va2_i*vx_i ; - vtemp2_i = va2_r * vx_i - ( va2_i * vx_r - vtemp2_i) ; - vtemp3_r = vtemp3_r + va3_r * vx_r + va3_i*vx_i ; - vtemp3_i = va3_r * vx_i - ( va3_i * vx_r - vtemp3_i); -#endif } - register FLOAT alpha_r = alpha[0] ; - register FLOAT alpha_i = alpha[1] ; - register FLOAT temp_r0 = vtemp0_r[0]+vtemp0_r[1] ; - register FLOAT temp_i0 = vtemp0_i[0]+vtemp0_i[1] ; - register FLOAT temp_r1 = vtemp1_r[0]+vtemp1_r[1] ; - register FLOAT temp_i1 = vtemp1_i[0]+vtemp1_i[1] ; +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register FLOAT temp_r2 = 
vtemp2_r[0]+vtemp2_r[1] ; - register FLOAT temp_i2 = vtemp2_i[0]+vtemp2_i[1] ; - register FLOAT temp_r3 = vtemp3_r[0]+vtemp3_r[1] ; - register FLOAT temp_i3 = vtemp3_i[0]+vtemp3_i[1] ; + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; + + register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1]; + register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1]; + + register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1]; + register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; + + register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1]; + register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1]; + + register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1]; + register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1]; + +#endif #if !defined(XCONJ) - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; #else - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += 
alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; #endif } #else -static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - FLOAT alpha_r = alpha[0]; - FLOAT alpha_i = alpha[1]; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + FLOAT temp_r0 = 0.0; FLOAT temp_r1 = 0.0; FLOAT temp_r2 = 0.0; @@ -154,423 +218,434 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT * FLOAT temp_i2 = 0.0; FLOAT temp_i3 = 0.0; - - for ( i=0; i< 2*n; i+=2 ) - { + for (i = 0; i < 2 * n; i += 2) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; - temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; - temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; - temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; - temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; - temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; - temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1]; - temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; + temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; + temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; + temp_i1 
+= a1[i] * x[i + 1] + a1[i + 1] * x[i]; + temp_r2 += a2[i] * x[i] - a2[i + 1] * x[i + 1]; + temp_i2 += a2[i] * x[i + 1] + a2[i + 1] * x[i]; + temp_r3 += a3[i] * x[i] - a3[i + 1] * x[i + 1]; + temp_i3 += a3[i] * x[i + 1] + a3[i + 1] * x[i]; #else - temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; - temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; - temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; - temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; - temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; - temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; - temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1]; - temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; + temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; + temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; + temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; + temp_r2 += a2[i] * x[i] + a2[i + 1] * x[i + 1]; + temp_i2 += a2[i] * x[i + 1] - a2[i + 1] * x[i]; + temp_r3 += a3[i] * x[i] + a3[i + 1] * x[i + 1]; + temp_i3 += a3[i] * x[i + 1] - a3[i + 1] * x[i]; #endif } #if !defined(XCONJ) - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; #else - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - 
y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; #endif } - + #endif #ifdef HAVE_KERNEL_4x2_VEC - -static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; - FLOAT *a0,*a1; - a0 = ap[0]; - a1 = ap[1]; - register __vector double vtemp0_r = {0.0,0.0}; - register __vector double vtemp0_i = {0.0,0.0}; - register __vector double vtemp1_r = {0.0,0.0}; - register __vector double vtemp1_i = {0.0,0.0}; - for ( i=0; i< 2*n; i+=4 ) - { - register __vector double vx_r = {x[i],x[i+2]}; - register __vector double vx_i = {x[i+1],x[i+3]}; - register __vector double va0_r= {a0[i],a0[i+2]}; - register __vector double va0_i= {a0[i+1],a0[i+3]}; - register __vector double va1_r= {a1[i],a1[i+2]}; - register __vector double va1_i= {a1[i+1],a1[i+3]}; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + //p for positive(real*real,image*image) r for image (real*image,image*real) + register __vector double vtemp0_p = {0.0, 0.0}; + register __vector double vtemp0_r = {0.0, 0.0}; + register __vector double vtemp1_p = {0.0, 0.0}; + register __vector double vtemp1_r = {0.0, 0.0}; + i = 0; + n = n << 1; + while (i < n) { + register __vector double vx_0 = *(__vector double*) (&x[i]); + register __vector double vx_1 = *(__vector double*) (&x[i + 2]); + register __vector double vx_2 = *(__vector double*) (&x[i + 4]); 
+ register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector double va0 = *(__vector double*) (&a0[i]); + register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); + register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); + register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - vtemp0_r = va0_r * vx_r - (va0_i*vx_i -vtemp0_r) ; - vtemp0_i = vtemp0_i + va0_r * vx_i + va0_i * vx_r ; - vtemp1_r = va1_r * vx_r - (va1_i*vx_i -vtemp1_r) ; - vtemp1_i = vtemp1_i + va1_r * vx_i + va1_i * vx_r ; -#else - vtemp0_r = vtemp0_r + va0_r * vx_r + va0_i*vx_i ; - vtemp0_i = va0_r * vx_i - ( va0_i * vx_r - vtemp0_i) ; - vtemp1_r = vtemp1_r + va1_r * vx_r + va1_i*vx_i ; - vtemp1_i = va1_r * vx_i - ( va1_i * vx_r - vtemp1_i); -#endif + register __vector double va1 = *(__vector double*) (&a1[i]); + register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); + register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); + register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); + + register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); + register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); + + i += 8; + + vtemp0_p += vx_0*va0; + vtemp0_r += vxr_0*va0; + + vtemp1_p += vx_0*va1; + vtemp1_r += vxr_0*va1; + + vxr_0 = vec_permi(vx_2, vx_2, 2); + vtemp0_p += vx_1*va0_1; + vtemp0_r += vxr_1*va0_1; + + vtemp1_p += vx_1*va1_1; + vtemp1_r += vxr_1*va1_1; + vxr_1 = vec_permi(vx_3, vx_3, 2); + + vtemp0_p += vx_2*va0_2; + vtemp0_r += vxr_0*va0_2; + + vtemp1_p += vx_2*va1_2; + vtemp1_r += vxr_0*va1_2; + + vtemp0_p += vx_3*va0_3; + vtemp0_r += vxr_1*va0_3; + + vtemp1_p += vx_3*va1_3; + vtemp1_r += vxr_1*va1_3; + } - register FLOAT temp_r0 = vtemp0_r[0]+vtemp0_r[1] ; - register FLOAT temp_i0 = vtemp0_i[0]+vtemp0_i[1] ; - register FLOAT temp_r1 = vtemp1_r[0]+vtemp1_r[1] ; - register FLOAT temp_i1 = vtemp1_i[0]+vtemp1_i[1] ; - 
register FLOAT alpha_r = alpha[0] ; - register FLOAT alpha_i = alpha[1] ; +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; + +#endif #if !defined(XCONJ) - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; #else - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; #endif } - + #else -static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; - FLOAT *a0,*a1; - a0 = ap[0]; - a1 = ap[1]; - FLOAT alpha_r = alpha[0]; - FLOAT alpha_i = alpha[1]; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + FLOAT temp_r0 = 0.0; FLOAT temp_r1 = 0.0; FLOAT temp_i0 = 0.0; FLOAT temp_i1 = 0.0; - - for ( i=0; i< 2*n; i+=2 ) - { + for (i = 0; i < 2 * n; i += 2) { #if ( !defined(CONJ) && 
!defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; - temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; - temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; - temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; + temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; + temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; + temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; #else - temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; - temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; - temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; - temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; + temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; + temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; + temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; #endif } #if !defined(XCONJ) - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; #else - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; #endif } #endif - #ifdef HAVE_KERNEL_4x1_VEC -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT *a0; - a0 = ap; - - register __vector double vtemp_r = {0.0,0.0}; - register __vector double vtemp_i = {0.0,0.0}; - - for ( i=0; i< 2*n; i+=4 ) - { - register __vector double va0_r= {a0[i],a0[i+2]}; - register __vector 
double va0_i= {a0[i+1],a0[i+3]}; - register __vector double vx0_r = {x[i],x[i+2]}; - register __vector double vx0_i = {x[i+1],x[i+3]}; +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0 ; + a0 = ap; + //p for positive(real*real,image*image) r for image (real*image,image*real) + register __vector double vtemp0_p = {0.0, 0.0}; + register __vector double vtemp0_r = {0.0, 0.0}; + i = 0; + n = n << 1; + while (i < n) { + + register __vector double vx_0 = *(__vector double*) (&x[i]); + register __vector double vx_1 = *(__vector double*) (&x[i + 2]); + register __vector double vx_2 = *(__vector double*) (&x[i + 4]); + register __vector double vx_3 = *(__vector double*) (&x[i + 6]); + + register __vector double va0 = *(__vector double*) (&a0[i]); + register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); + register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); + register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); + + register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); + register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); + + i += 8; + + vtemp0_p += vx_0*va0; + vtemp0_r += vxr_0*va0; + + vxr_0 = vec_permi(vx_2, vx_2, 2); + vtemp0_p += vx_1*va0_1; + vtemp0_r += vxr_1*va0_1; + + vxr_1 = vec_permi(vx_3, vx_3, 2); + + vtemp0_p += vx_2*va0_2; + vtemp0_r += vxr_0*va0_2; + + vtemp0_p += vx_3*va0_3; + vtemp0_r += vxr_1*va0_3; + + } #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - vtemp_r = va0_r * vx0_r - (va0_i*vx0_i -vtemp_r) ; - vtemp_i = vtemp_i + va0_r * vx0_i + va0_i * vx0_r ; #else - vtemp_r = vtemp_r + va0_r * vx0_r + va0_i*vx0_i ; - vtemp_i = va0_r * vx0_i - ( va0_i * vx0_r - vtemp_i) ; -#endif - } - - register FLOAT temp_r0 = vtemp_r[0]+vtemp_r[1] ; - register FLOAT temp_i0 = vtemp_i[0]+vtemp_i[1] ; - 
register FLOAT alpha_r = alpha[0] ; - register FLOAT alpha_i = alpha[1] ; + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; + +#endif #if !defined(XCONJ) - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; #else - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; #endif - } #else -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; FLOAT *a0; a0 = ap; - FLOAT alpha_r = alpha[0]; - FLOAT alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; FLOAT temp_i0 = 0.0; - for ( i=0; i< 2*n; i+=2 ) - { + for (i = 0; i < 2 * n; i += 2) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; - temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; #else - temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; - temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; #endif } #if !defined(XCONJ) - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; #else - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; #endif - } #endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) 
-{ - BLASLONG i; - for ( i=0; i> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - alpha[0] = alpha_r; - alpha[1] = alpha_i; + m2 = (m & (NBMAX - 1)) - m3; BLASLONG NB = NBMAX; - while ( NB == NBMAX ) - { - + while (NB == NBMAX) { + m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; + if (m1 < 0) { + if (m2 == 0) break; NB = m2; } - + y_ptr = y; a_ptr = a; x_ptr = x; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - if ( inc_x != 2 ) - copy_x(NB,x_ptr,xbuffer,inc_x); + + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); else xbuffer = x_ptr; - - if ( inc_y == 2 ) - { - for( i = 0; i < n1 ; i++) - { - zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 2; y_ptr += 8; - + } - if ( n2 & 2 ) - { - zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); - a_ptr += lda * 2; + if (n2 & 2) { + zgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; y_ptr += 4; } - if ( n2 & 1 ) - { - zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); a_ptr += lda; y_ptr += 2; } - } - else - { + } else { - for( i = 0; i < n1 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; y_ptr[0] += ybuffer[0]; y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; + y_ptr += inc_y; y_ptr[0] += ybuffer[2]; y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; + y_ptr += inc_y; y_ptr[0] += ybuffer[4]; 
y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; + y_ptr += inc_y; y_ptr[0] += ybuffer[6]; y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; + y_ptr += inc_y; } - for( i = 0; i < n2 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); a_ptr += lda; y_ptr[0] += ybuffer[0]; y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; + y_ptr += inc_y; } } a += 2 * NB; - x += NB * inc_x; + x += NB * inc_x; } + if (m3 == 0) return (0); + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; - if ( m3 == 0 ) return(0); + if (m3 == 3) { - x_ptr = x; - j=0; - a_ptr = a; - y_ptr = y; - - if ( m3 == 3 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; + FLOAT temp_r; + FLOAT temp_i; FLOAT x0 = x_ptr[0]; FLOAT x1 = x_ptr[1]; x_ptr += inc_x; @@ -579,21 +654,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, x_ptr += inc_x; FLOAT x4 = x_ptr[0]; FLOAT x5 = x_ptr[1]; - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; temp_i += a_ptr[4] * x5 - a_ptr[5] * 
x4; #endif @@ -610,91 +684,84 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, y_ptr += inc_y; j++; } - return(0); + return (0); } + if (m3 == 2) { - if ( m3 == 2 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; FLOAT x0 = x_ptr[0]; FLOAT x1 = x_ptr[1]; x_ptr += inc_x; FLOAT x2 = x_ptr[0]; FLOAT x3 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - while ( j < ( n & -2 )) - { + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 
+= a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; #endif a_ptr += lda; y_ptr += inc_y; - j+=2; + j += 2; } - - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i 
- ai * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif a_ptr += lda; @@ -702,87 +769,79 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, j++; } - return(0); + return (0); } + if (m3 == 1) { - if ( m3 == 1 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; FLOAT x0 = x_ptr[0]; FLOAT x1 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - while ( j < ( n & -2 )) - { + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; 
y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; #endif a_ptr += lda; y_ptr += inc_y; - j+=2; + j += 2; } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif a_ptr += lda; y_ptr += inc_y; j++; } - return(0); + return (0); } - return(0); - + return (0); } - diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index e9285e213..4764c0a52 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -30,74 +30,119 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { - __asm__( + BLASLONG tempR1 ; + __asm__ ( + "pfd 2, 0(%[x_tmp]) \n\t" +#if !defined(CONJ) + "lgdr %[t1],%[alpha_r] \n\t" + "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint + "lgdr %[t1],%[alpha_i] \n\t" + "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint + "vflcdb %%v29,%%v29 \n\t" //complement both + "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} - "pfd 1, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha_r] \n\t" - "vlvgp %%v24,%%r0,%%r0 \n\t" - "lgdr %%r0,%[alpha_i] \n\t" - "vlvgp %%v25,%%r0,%%r0 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" +#else + "lgdr %[t1],%[alpha_i] \n\t" + "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint + "lgdr %[t1],%[alpha_r] \n\t" + "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint + "vflcdb %%v28,%%v28 \n\t" //complement both + "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} +#endif + + "xgr %[t1],%[t1] \n\t" + "sllg %[tmp],%[tmp],4 \n\t" + "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" + "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" + "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" + "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" + + "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition + "j 2f \n\t" ".align 16 \n\t" "1: \n\t" - "pfd 2, 256(%[x_ptr] ) \n\t" + + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + "vfmdb %%v16, %%v20, %%v28 \n\t" + "vfmdb %%v17, %%v21, %%v28 \n\t" + "vfmdb %%v18, %%v22, %%v28 \n\t" + "vfmdb %%v19, %%v23, %%v28 \n\t" + "vl %%v20, 64(%[t1],%[x_tmp]) \n\t" + "vl %%v21, 80(%[t1],%[x_tmp]) \n\t" + "vl %%v22, 96(%[t1],%[x_tmp]) \n\t" + "vl %%v23, 112(%[t1],%[x_tmp]) \n\t" + "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" + "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" + "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" + "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - "vleg %%v20 , 0(%[x_ptr]),0 \n\t" - 
"vleg %%v21 , 8(%[x_ptr]),0 \n\t" - "vleg %%v20 , 16(%[x_ptr]),1 \n\t" - "vleg %%v21 , 24(%[x_ptr]),1 \n\t" - "vleg %%v22 , 32(%[x_ptr]),0 \n\t" - "vleg %%v23 , 40(%[x_ptr]),0 \n\t" - "vleg %%v22 , 48(%[x_ptr]),1 \n\t" - "vleg %%v23 , 56(%[x_ptr]),1 \n\t" - "vfmdb %%v16, %%v21, %%v25 \n\t" - "vfmdb %%v17, %%v20, %%v25 \n\t" - "vfmdb %%v18, %%v23, %%v25 \n\t" - "vfmdb %%v19, %%v22, %%v25 \n\t" - "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" - "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" - "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" - "vsteg %%v16 , 0(%[x_ptr]),0 \n\t" - "vsteg %%v17 , 8(%[x_ptr]),0 \n\t" - "vsteg %%v16 , 16(%[x_ptr]),1 \n\t" - "vsteg %%v17 , 24(%[x_ptr]),1 \n\t" - "vsteg %%v18 , 32(%[x_ptr]),0 \n\t" - "vsteg %%v19 , 40(%[x_ptr]),0 \n\t" - "vsteg %%v18 , 48(%[x_ptr]),1 \n\t" - "vsteg %%v19 , 56(%[x_ptr]),1 \n\t" - "vleg %%v20 , 64(%[x_ptr]),0 \n\t" - "vleg %%v21 , 72(%[x_ptr]),0 \n\t" - "vleg %%v20 , 80(%[x_ptr]),1 \n\t" - "vleg %%v21 , 88(%[x_ptr]),1 \n\t" - "vleg %%v22 , 96(%[x_ptr]),0 \n\t" - "vleg %%v23 , 104(%[x_ptr]),0 \n\t" - "vleg %%v22 , 112(%[x_ptr]),1 \n\t" - "vleg %%v23 , 120(%[x_ptr]),1 \n\t" - "vfmdb %%v16, %%v21, %%v25 \n\t" - "vfmdb %%v17, %%v20, %%v25 \n\t" - "vfmdb %%v18, %%v23, %%v25 \n\t" - "vfmdb %%v19, %%v22, %%v25 \n\t" - "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" - "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" - "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" - "vsteg %%v16 , 64(%[x_ptr]),0 \n\t" - "vsteg %%v17 , 72(%[x_ptr]),0 \n\t" - "vsteg %%v16 , 80(%[x_ptr]),1 \n\t" - "vsteg %%v17 , 88(%[x_ptr]),1 \n\t" - "vsteg %%v18 , 96(%[x_ptr]),0 \n\t" - "vsteg %%v19 , 104(%[x_ptr]),0 \n\t" - "vsteg %%v18 , 112(%[x_ptr]),1 \n\t" - "vsteg %%v19 , 120(%[x_ptr]),1 \n\t" - - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", 
"r0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25" + + "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" + "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" + "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" + "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" + + "la %[t1],64(%[t1] ) \n\t" + "2: \n\t" + "pfd 2, 256(%[t1],%[x_tmp]) \n\t" + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + + "vfmdb %%v30, %%v20, %%v28 \n\t" + "vfmdb %%v31, %%v21, %%v28 \n\t" + "vfmdb %%v6, %%v22, %%v28 \n\t" + "vfmdb %%v7, %%v23, %%v28 \n\t" + + "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" + "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" + "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" + "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" + + "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" + "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" + "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" + "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" + + + "vst %%v30 , 0(%[t1],%[x_tmp]) \n\t" + "vst %%v31 , 16(%[t1],%[x_tmp]) \n\t" + "vst %%v6 , 32(%[t1],%[x_tmp]) \n\t" + "vst %%v7 , 48(%[t1],%[x_tmp]) \n\t" + + "la %[t1],64(%[t1] ) \n\t" + + + "clgrjl %[t1],%[tmp],1b \n\t" +//---------------------------------------------------------------------- + "vfmdb %%v16, %%v20, %%v28 \n\t" + "vfmdb %%v17, %%v21, %%v28 \n\t" + "vfmdb %%v18, %%v22, %%v28 \n\t" + "vfmdb %%v19, %%v23, %%v28 \n\t" + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" + "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" + "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" + "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" + + "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" + "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" + "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" + "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" + + : [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) + : [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) + : "cc", 
"v6","v7", "v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); + } From 35c5a323095f8ebdf2380a47021642534aa6b89c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Mar 2018 21:52:29 +0200 Subject: [PATCH 09/23] Correct index variables used in MFlops calculation Fixes #1474 --- benchmark/gemm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 809813c92..85bcbc710 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -237,7 +237,7 @@ int main(int argc, char *argv[]){ timeg = time1/loops; fprintf(stderr, " %10.2f MFlops %10.6f sec\n", - COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1); + COMPSIZE * COMPSIZE * 2. * (double)k * (double)m * (double)n / timeg * 1.e-6, time1); } From 752fdb5dd8418c0ae56e308067c043b8fe39e695 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Mar 2018 11:56:56 +0200 Subject: [PATCH 10/23] Add workaround for old gcc and clang versions Old gcc and clang do not handle constructor arguments, finally fix #875 as discussed there, using the fedora patch --- driver/others/memory.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index 474d97c4d..1cb7519b4 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -148,8 +148,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) #else +#if __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) +#elif __GNUC__ && INIT_PRIORITY +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) +#else +#define CONSTRUCTOR +#define DESTRUCTOR #endif #ifdef DYNAMIC_ARCH From 93db123f7e36fafff65d151cf10f95c54dee3608 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Mar 2018 13:13:49 +0200 Subject: [PATCH 11/23] Update memory.c --- driver/others/memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 1cb7519b4..41937ca32 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -147,8 +147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) -#else -#if __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) +#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) #elif __GNUC__ && INIT_PRIORITY From 01c4b82f045851615074fd1bdc7de06a8b253cf6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 31 Mar 2018 22:32:06 +0200 Subject: [PATCH 12/23] Update memory.c --- driver/others/memory.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 41937ca32..a6d4e636c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -150,12 +150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) -#elif __GNUC__ && INIT_PRIORITY +#else #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) -#else -#define CONSTRUCTOR -#define DESTRUCTOR #endif #ifdef DYNAMIC_ARCH From 8da6b6ae52d0bfa86cf4f3935362039f033b13d9 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Mon, 2 Apr 2018 10:48:22 -0700 Subject: [PATCH 13/23] Allow building on OpenBSD With this change, OpenBLAS builds and all tests pass on OpenBSD 6.2 using Clang. Tested on x86-64 only, with and without DYNAMIC_ARCH=1. --- Makefile | 6 +----- Makefile.install | 9 ++------- Makefile.system | 2 +- c_check | 1 + common.h | 2 +- common_x86.h | 2 +- common_x86_64.h | 2 +- ctest.c | 4 ++++ driver/others/blas_server.c | 2 +- driver/others/memory.c | 10 +++++----- exports/Makefile | 2 +- getarch.c | 6 +++--- 12 files changed, 22 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 5198f9e2b..7818b3cd9 100644 --- a/Makefile +++ b/Makefile @@ -91,11 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) @ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), FreeBSD) - @$(MAKE) -C exports so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so -endif -ifeq ($(OSNAME), NetBSD) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif diff --git a/Makefile.install b/Makefile.install index 81d097215..e22c61da7 100644 --- a/Makefile.install +++ b/Makefile.install @@ -72,12 +72,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), FreeBSD) - @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" - @cd 
"$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so -endif -ifeq ($(OSNAME), NetBSD) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so @@ -115,7 +110,7 @@ endif ifndef NO_SHARED #ifeq logical or -ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) diff --git a/Makefile.system b/Makefile.system index 9720b317f..062e14b54 100644 --- a/Makefile.system +++ b/Makefile.system @@ -230,7 +230,7 @@ endif MD5SUM = md5 -r endif -ifeq ($(OSNAME), FreeBSD) +ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD)) MD5SUM = md5 -r endif diff --git a/c_check b/c_check index 20da288be..a48d58d27 100644 --- a/c_check +++ b/c_check @@ -54,6 +54,7 @@ $compiler = GCC if ($compiler eq ""); $os = Linux if ($data =~ /OS_LINUX/); $os = FreeBSD if ($data =~ /OS_FREEBSD/); $os = NetBSD if ($data =~ /OS_NETBSD/); +$os = OpenBSD if ($data =~ /OS_OPENBSD/); $os = Darwin if ($data =~ /OS_DARWIN/); $os = SunOS if ($data =~ /OS_SUNOS/); $os = AIX if ($data =~ /OS_AIX/); diff --git a/common.h b/common.h index ae98279ef..79f15b89a 100644 --- a/common.h +++ b/common.h @@ -93,7 +93,7 @@ extern "C" { #include #endif -#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID) +#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_ANDROID) #include #endif diff --git a/common_x86.h b/common_x86.h index 4363fb2f4..4cf783473 100644 --- a/common_x86.h +++ b/common_x86.h @@ -327,7 +327,7 @@ REALNAME: #endif #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) 
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__) #define PROLOGUE \ .text; \ .align 16; \ diff --git a/common_x86_64.h b/common_x86_64.h index bee88d3ce..4ce2ef7bf 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -403,7 +403,7 @@ REALNAME: #define EPILOGUE .end #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ diff --git a/ctest.c b/ctest.c index 27d3b473a..de289ccea 100644 --- a/ctest.c +++ b/ctest.c @@ -60,6 +60,10 @@ OS_FREEBSD OS_NETBSD #endif +#if defined(__OpenBSD__) +OS_OPENBSD +#endif + #if defined(__sun) OS_SUNOS #endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 2e0fe190d..863c58773 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*********************************************************************/ #include "common.h" -#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) #include #include #include diff --git a/driver/others/memory.c b/driver/others/memory.c index 474d97c4d..8efe8f086 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #endif -#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) #include #include #endif @@ -246,7 +246,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) int get_num_procs(void) { @@ -336,7 +336,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -344,7 +344,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -368,7 +368,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif diff --git a/exports/Makefile b/exports/Makefile index 79c251d62..e5e203053 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -156,7 +156,7 @@ endif endif #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) so : ../$(LIBSONAME) diff --git a/getarch.c b/getarch.c 
index 24ea5fe5f..94c6ae6a4 100644 --- a/getarch.c +++ b/getarch.c @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef OS_WINDOWS #include #endif -#if defined(__FreeBSD__) || defined(__APPLE__) +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) #include #include #endif @@ -1074,7 +1074,7 @@ static int get_num_cores(void) { #ifdef OS_WINDOWS SYSTEM_INFO sysinfo; -#elif defined(__FreeBSD__) || defined(__APPLE__) +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) int m[2], count; size_t len; #endif @@ -1088,7 +1088,7 @@ static int get_num_cores(void) { GetSystemInfo(&sysinfo); return sysinfo.dwNumberOfProcessors; -#elif defined(__FreeBSD__) || defined(__APPLE__) +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) m[0] = CTL_HW; m[1] = HW_NCPU; len = sizeof(int); From a41d241a0e9fe70c95e9ce1e406d5c57fd2d593b Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Tue, 3 Apr 2018 16:39:29 -0700 Subject: [PATCH 14/23] Add support for DragonFly BSD --- Makefile | 2 +- Makefile.install | 4 ++-- Makefile.system | 2 +- c_check | 1 + common.h | 2 +- common_x86_64.h | 2 +- ctest.c | 4 ++++ driver/others/blas_server.c | 2 +- driver/others/memory.c | 10 +++++----- getarch.c | 6 +++--- 10 files changed, 20 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 7818b3cd9..c0e5fbcf8 100644 --- a/Makefile +++ b/Makefile @@ -91,7 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) @ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif diff --git a/Makefile.install b/Makefile.install index e22c61da7..9ce5ceae6 100644 --- a/Makefile.install +++ b/Makefile.install @@ -72,7 +72,7 @@ ifeq ($(OSNAME), $(filter 
$(OSNAME),Linux SunOS Android)) ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so @@ -110,7 +110,7 @@ endif ifndef NO_SHARED #ifeq logical or -ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) diff --git a/Makefile.system b/Makefile.system index 062e14b54..d504a1111 100644 --- a/Makefile.system +++ b/Makefile.system @@ -230,7 +230,7 @@ endif MD5SUM = md5 -r endif -ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD)) +ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) MD5SUM = md5 -r endif diff --git a/c_check b/c_check index a48d58d27..a3b337602 100644 --- a/c_check +++ b/c_check @@ -55,6 +55,7 @@ $os = Linux if ($data =~ /OS_LINUX/); $os = FreeBSD if ($data =~ /OS_FREEBSD/); $os = NetBSD if ($data =~ /OS_NETBSD/); $os = OpenBSD if ($data =~ /OS_OPENBSD/); +$os = DragonFly if ($data =~ /OS_DRAGONFLY/); $os = Darwin if ($data =~ /OS_DARWIN/); $os = SunOS if ($data =~ /OS_SUNOS/); $os = AIX if ($data =~ /OS_AIX/); diff --git a/common.h b/common.h index 79f15b89a..5a599a5af 100644 --- a/common.h +++ b/common.h @@ -93,7 +93,7 @@ extern "C" { #include #endif -#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_ANDROID) +#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID) #include #endif diff --git a/common_x86_64.h b/common_x86_64.h 
index 4ce2ef7bf..1cc71506a 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -403,7 +403,7 @@ REALNAME: #define EPILOGUE .end #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__) || defined(C_PGI) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ diff --git a/ctest.c b/ctest.c index de289ccea..00be423d1 100644 --- a/ctest.c +++ b/ctest.c @@ -64,6 +64,10 @@ OS_NETBSD OS_OPENBSD #endif +#if defined(__DragonFly__) +OS_DRAGONFLY +#endif + #if defined(__sun) OS_SUNOS #endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 863c58773..794dfb20e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*********************************************************************/ #include "common.h" -#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) #include #include #include diff --git a/driver/others/memory.c b/driver/others/memory.c index 8efe8f086..6920efaaa 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #endif -#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include #endif @@ -246,7 +246,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) || defined(OS_OPENBSD) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) int get_num_procs(void) { @@ -336,7 +336,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -344,7 +344,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -368,7 +368,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif diff --git a/getarch.c b/getarch.c index 94c6ae6a4..992fc2b95 100644 --- a/getarch.c +++ b/getarch.c @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. #ifdef OS_WINDOWS #include #endif -#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) #include #include #endif @@ -1074,7 +1074,7 @@ static int get_num_cores(void) { #ifdef OS_WINDOWS SYSTEM_INFO sysinfo; -#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) int m[2], count; size_t len; #endif @@ -1088,7 +1088,7 @@ static int get_num_cores(void) { GetSystemInfo(&sysinfo); return sysinfo.dwNumberOfProcessors; -#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) m[0] = CTL_HW; m[1] = HW_NCPU; len = sizeof(int); From 33f838393cb3870723774d51afc27405fa2c6429 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Tue, 3 Apr 2018 16:42:01 -0700 Subject: [PATCH 15/23] Add OpenBSD and DragonFly to community supported platforms --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index ec32c1f60..b5449a45e 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,8 @@ Please read GotoBLAS_01Readme.txt - **MingWin or Visual Studio(CMake)/Windows**: Please read . - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. - **FreeBSD**: Supported by community. We didn't test the library on this OS. +- **OpenBSD**: Supported by community. We didn't test the library on this OS. +- **DragonFly BSD**: Supported by community. We didn't test the library on this OS. - **Android**: Supported by community. Please read . 
## Usages From bb9876db33952cf9e2636edda50b6cb0eb6f5912 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Apr 2018 18:16:52 +0200 Subject: [PATCH 16/23] Fix thread races and infinite looping on systems with many cpus On systems with more than 64 cpus, blas_quickdivide will sometimes return zero which creates bogus workloads when used for the stride calculation. This then leads to threads spinning incessantly waiting for a status change that never happens, as seen in #1497. This patch also fixes several data races that were found by helgrind and/or tsan while debugging the issue. --- lapack/getrf/getrf_parallel.c | 96 ++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 8 deletions(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index db8c836e0..91d97a791 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -67,6 +67,26 @@ double sqrt(double); #undef GETRF_FACTOR #define GETRF_FACTOR 1.00 + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t getrf_lock = 0; +#else +static BLASULONG getrf_lock = 0UL; +#endif + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t getrf_flag_lock = 0; +#else +static BLASULONG getrf_flag_lock = 0UL; +#endif + + + + static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { double m = (double)(M - IS - BK); @@ -217,7 +237,10 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * blasint *ipiv = (blasint *)args -> c; - volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; + //_Atomic + BLASLONG jw; + + _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); @@ -245,8 +268,20 @@ static int 
inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { for (i = 0; i < args -> nthreads; i++) +#if 1 + { + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + do { + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + } while (jw); + } +#else while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {}; - +#endif for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -283,18 +318,23 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * b + (is + jjs * lda) * COMPSIZE, lda, is); } } - MB; - for (i = 0; i < args -> nthreads; i++) + for (i = 0; i < args -> nthreads; i++) { +LOCK_COMMAND(&getrf_lock); job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - +UNLOCK_COMMAND(&getrf_lock); + } } +LOCK_COMMAND(&getrf_flag_lock); flag[mypos * CACHE_LINE_SIZE] = 0; +UNLOCK_COMMAND(&getrf_flag_lock); if (m == 0) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { +LOCK_COMMAND(&getrf_lock); job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; +UNLOCK_COMMAND(&getrf_lock); } } @@ -318,7 +358,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { if ((current != mypos) && (!is)) { +#if 1 + LOCK_COMMAND(&getrf_lock); + jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + do { + LOCK_COMMAND(&getrf_lock); + jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + } while (jw == 0); +#else while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {}; +#endif } 
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, @@ -327,7 +378,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * MB; if (is + min_i >= m) { +LOCK_COMMAND(&getrf_lock); job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; +UNLOCK_COMMAND(&getrf_lock); } } @@ -339,7 +392,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { +#if 1 + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; + UNLOCK_COMMAND(&getrf_lock); + do { + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; + UNLOCK_COMMAND(&getrf_lock); + } while(jw != 0); +#else while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {}; +#endif } } @@ -374,6 +438,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG i, j, k, is, bk; BLASLONG num_cpu; + BLASLONG f; #ifdef _MSC_VER BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; @@ -501,11 +566,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (mm >= nn) { width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (width == 0) width = nn; if (nn < width) width = nn; nn -= width; range_N[num_cpu + 1] = range_N[num_cpu] + width; width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (width == 0) width = mm; if (mm < width) width = mm; if (nn <= 0) width = mm; mm -= width; @@ -514,11 +581,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } else { width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (width == 0) width = mm; if (mm < width) width = mm; mm -= width; range_M[num_cpu + 1] = range_M[num_cpu] + width; width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if 
(width == 0) width = nn; if (nn < width) width = nn; if (mm <= 0) width = nn; nn -= width; @@ -561,7 +630,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, range_n_new[1] = offset + is + bk; if (num_cpu > 0) { - queue[num_cpu - 1].next = NULL; exec_blas_async(0, &queue[0]); @@ -572,8 +640,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (iinfo && !info) info = iinfo + is; - for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; - + for (i = 0; i < num_cpu; i ++) { +#if 1 + LOCK_COMMAND(&getrf_flag_lock); + f=flag[i*CACHE_LINE_SIZE]; + UNLOCK_COMMAND(&getrf_flag_lock); + while (f!=0) { + LOCK_COMMAND(&getrf_flag_lock); + f=flag[i*CACHE_LINE_SIZE]; + UNLOCK_COMMAND(&getrf_flag_lock); + }; +#else + while (flag[i*CACHE_LINE_SIZE]) {}; +#endif + } TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); } else { From 36a17536ca739cea2c773a478b7bc0688cd59434 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Tue, 3 Apr 2018 15:09:25 -0700 Subject: [PATCH 17/23] Compile with cc rather than gcc whenever possible --- Makefile.system | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/Makefile.system b/Makefile.system index d504a1111..769628e98 100644 --- a/Makefile.system +++ b/Makefile.system @@ -17,16 +17,20 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib # http://stackoverflow.com/questions/4029274/mingw-and-make-variables # - Default value is 'cc' which is not always a valid command (e.g. MinGW). ifeq ($(origin CC),default) + +# Check if $(CC) refers to a valid command and set the value to gcc if not +ifneq ($(findstring cmd.exe,$(SHELL)),) +ifeq ($(shell where $(CC) 2>NUL),) +CC = gcc +endif +else # POSIX +ifeq ($(shell command -v $(CC) 2>/dev/null),) CC = gcc -# Change the default compile to clang on Mac OSX. 
-# http://stackoverflow.com/questions/714100/os-detecting-makefile -UNAME_S := $(shell uname -s) -ifeq ($(UNAME_S),Darwin) - CC = clang -# EXTRALIB += -Wl,-no_compact_unwind endif endif +endif # CC is set to default + # Default Fortran compiler (FC) is selected by f_check. ifndef MAKEFILE_RULE From 8f811a9312f6692c084c25fa78c45827accb7103 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Wed, 4 Apr 2018 11:41:45 -0700 Subject: [PATCH 18/23] Reinstate macOS logic --- Makefile.system | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index 769628e98..142cb420f 100644 --- a/Makefile.system +++ b/Makefile.system @@ -23,11 +23,16 @@ ifneq ($(findstring cmd.exe,$(SHELL)),) ifeq ($(shell where $(CC) 2>NUL),) CC = gcc endif -else # POSIX +else # POSIX-ish ifeq ($(shell command -v $(CC) 2>/dev/null),) +ifeq ($(shell uname -s),Darwin) +CC = clang +# EXTRALIB += -Wl,-no_compact_unwind +else CC = gcc -endif -endif +endif # Darwin +endif # CC exists +endif # Shell is sane endif # CC is set to default From ca8ca796d3d6d35b33f879d3af75567fcb7348c5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Apr 2018 22:26:51 +0200 Subject: [PATCH 19/23] Underline importance of NUM_THREADS setting for BUFFER allocation following augray's suggestion from #1451, and incorporating ashwinyes' comments from #1141 on the importance of NUM_THREADS even for single-threaded builds. --- USAGE.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/USAGE.md b/USAGE.md index c76ceb324..89f3bba67 100644 --- a/USAGE.md +++ b/USAGE.md @@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set `MAX_CPU_NUMBER=NUM_THREADS`. 
+Despite its name, and due to the use of memory buffers in functions like SGEMM, +the setting of NUM_THREADS can be relevant even for a single-threaded build +of OpenBLAS, if such functions get called by multiple threads of a program +that uses OpenBLAS. In some cases, the affected code may simply crash or throw +a segmentation fault without displaying the above warning first. + +Note that the number of threads used at runtime can be altered to differ from the +value NUM_THREADS was set to at build time. At runtime, the actual number of +threads can be set anywhere from 1 to the build's NUM_THREADS (note however, +that this does not change the number of memory buffers that will be allocated, +which is set at build time). The number of threads for a process can be set by +using the mechanisms described below. + + #### How can I use OpenBLAS in multi-threaded applications? If your application is already multi-threaded, it will conflict with OpenBLAS From 8ec28ff4619b8e6fc2c88543f2902e8f95948ae0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Apr 2018 22:40:30 +0200 Subject: [PATCH 20/23] Remove unguarded use of _Atomic and fix tabbing --- lapack/getrf/getrf_parallel.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 91d97a791..b48765e55 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -237,10 +237,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * blasint *ipiv = (blasint *)args -> c; - //_Atomic BLASLONG jw; - _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; + volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); @@ -320,21 +319,21 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * } MB; for (i = 0; i < args -> nthreads; i++) { -LOCK_COMMAND(&getrf_lock); + 
LOCK_COMMAND(&getrf_lock); job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -UNLOCK_COMMAND(&getrf_lock); + UNLOCK_COMMAND(&getrf_lock); } } -LOCK_COMMAND(&getrf_flag_lock); + LOCK_COMMAND(&getrf_flag_lock); flag[mypos * CACHE_LINE_SIZE] = 0; -UNLOCK_COMMAND(&getrf_flag_lock); + UNLOCK_COMMAND(&getrf_flag_lock); if (m == 0) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { -LOCK_COMMAND(&getrf_lock); + LOCK_COMMAND(&getrf_lock); job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; -UNLOCK_COMMAND(&getrf_lock); + UNLOCK_COMMAND(&getrf_lock); } } @@ -378,9 +377,9 @@ UNLOCK_COMMAND(&getrf_lock); MB; if (is + min_i >= m) { -LOCK_COMMAND(&getrf_lock); + LOCK_COMMAND(&getrf_lock); job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; -UNLOCK_COMMAND(&getrf_lock); + UNLOCK_COMMAND(&getrf_lock); } } From 137ccd9dd96468ce26cea78ba75a70a7f3c73079 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Wed, 4 Apr 2018 14:30:32 -0700 Subject: [PATCH 21/23] Minor changes to wording and formatting in the README The wording in some places is not grammatically correct. This change also provides minor adjustments to the Markdown formatting which provide modest improvements to readability. --- README.md | 232 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 137 insertions(+), 95 deletions(-) diff --git a/README.md b/README.md index b5449a45e..02d087334 100644 --- a/README.md +++ b/README.md @@ -5,177 +5,219 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) + ## Introduction + OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. -Please read the documents on OpenBLAS wiki pages . +Please read the documentation on the OpenBLAS wiki pages: . 
## Binary Packages -We provide binary packages for the following platform. + +We provide official binary packages for the following platform: * Windows x86/x86_64 You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). ## Installation from Source -Download from project homepage. http://xianyi.github.com/OpenBLAS/ -Or, check out codes from git://github.com/xianyi/OpenBLAS.git +Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code +using Git from https://github.com/xianyi/OpenBLAS.git. + +### Dependencies + +Building OpenBLAS requires the following to be installed: + +* GNU Make +* A C compiler, e.g. GCC or Clang +* A Fortran compiler (optional, for LAPACK) +* IBM MASS (optional, see below) + ### Normal compile - * type "make" to detect the CPU automatically. - or - * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. + +Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically. +To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`. +The full target list is in the file `TargetList.txt`. ### Cross compile -Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. + +Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler. +The target must be specified explicitly when cross compiling. Examples: -On X86 box, compile this library for loongson3a CPU. +* On an x86 box, compile this library for a loongson3a CPU: + ```sh + make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A + ``` - make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A - -On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler. 
- - make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 +* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: + ```sh + make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 + ``` ### Debug version - make DEBUG=1 +A debug version can be built using `make DEBUG=1`. -### Compile with MASS Support on Power CPU (Optional dependency) +### Compile with MASS support on Power CPU (optional) -[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and -Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER. -The library can be installed as below - +The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library +consists of a set of mathematical functions for C, C++, and Fortran applications that are +are tuned for optimum performance on POWER architectures. +OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. +The library can be installed as shown: - * On Ubuntu: +* On Ubuntu: + ```sh + wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add - + echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list + sudo apt-get update + sudo apt-get install libxlmass-devel.8.1.5 + ``` - wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
- echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
- sudo apt-get update
- sudo apt-get install libxlmass-devel.8.1.5
+* On RHEL/CentOS: + ```sh + wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key + sudo rpm --import repomd.xml.key + wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo + sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/ + sudo yum install libxlmass-devel.8.1.5 + ``` - * On RHEL/CentOS: +After installing the MASS library, compile OpenBLAS with `USE_MASS=1`. +For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`. - wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
- sudo rpm --import repomd.xml.key
- wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
- sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
- sudo yum install libxlmass-devel.8.1.5
+### Install to a specific directory (optional) -After installing MASS library, compile openblas with USE_MASS=1. +Use `PREFIX=` when invoking `make`, for example -Example: +```sh +make install PREFIX=your_installation_directory +``` -Compiling on Power8 with MASS support - +The default installation directory is `/opt/OpenBLAS`. - make USE_MASS=1 TARGET=POWER8 +## Supported CPUs and Operating Systems -### Install to the directory (optional) +Please read `GotoBLAS_01Readme.txt`. -Example: +### Additional supported CPUs - make install PREFIX=your_installation_directory +#### x86/x86-64 -The default directory is /opt/OpenBLAS - -## Support CPU & OS -Please read GotoBLAS_01Readme.txt - -### Additional support CPU: - -#### x86/x86-64: - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. -- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar) +- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. -#### MIPS64: +#### MIPS64 + - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. - **ICT Loongson 3B**: Experimental -#### ARM: -- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ ) -- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 ) +#### ARM -#### ARM64: -- **ARMV8**: Experimental +- **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+) +- **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. 
Cortex A8, A9 and A15) + +#### ARM64 + +- **ARMv8**: Experimental - **ARM Cortex-A57**: Experimental #### PPC/PPC64 -- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1 -#### IBM zEnterprise System: +- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` + +#### IBM zEnterprise System + - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) - -### Support OS: +### Supported OS + - **GNU/Linux** -- **MingWin or Visual Studio(CMake)/Windows**: Please read . -- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. -- **FreeBSD**: Supported by community. We didn't test the library on this OS. -- **OpenBSD**: Supported by community. We didn't test the library on this OS. -- **DragonFly BSD**: Supported by community. We didn't test the library on this OS. -- **Android**: Supported by community. Please read . +- **MinGW or Visual Studio (CMake)/Windows**: Please read . +- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts. +- **FreeBSD**: Supported by the community. We don't actively test the library on this OS. +- **OpenBSD**: Supported by the community. We don't actively test the library on this OS. +- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS. +- **Android**: Supported by the community. Please read . -## Usages -Link with libopenblas.a or -lopenblas for shared library. +## Usage -### Set the number of threads with environment variables. +Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was +compiled as a shared library. -Examples: +### Setting the number of threads using environment variables - export OPENBLAS_NUM_THREADS=4 +Environment variables are used to specify a maximum number of threads. 
+For example, - or +```sh +export OPENBLAS_NUM_THREADS=4 +export GOTO_NUM_THREADS=4 +export OMP_NUM_THREADS=4 +``` - export GOTO_NUM_THREADS=4 +The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`. - or +If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS` +environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when +compiled with `USE_OPENMP=1`. - export OMP_NUM_THREADS=4 +### Setting the number of threads at runtime -The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. +We provide the following functions to control the number of threads at runtime: -If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. +```c +void goto_set_num_threads(int num_threads); +void openblas_set_num_threads(int num_threads); +``` -### Set the number of threads on runtime. +If you compile this library with `USE_OPENMP=1`, you should use the above functions too. -We provided the below functions to control the number of threads on runtime. +## Reporting bugs - void goto_set_num_threads(int num_threads); - - void openblas_set_num_threads(int num_threads); - -If you compile this lib with USE_OPENMP=1, you should use the above functions, too. - -## Report Bugs -Please add a issue in https://github.com/xianyi/OpenBLAS/issues +Please submit an issue in https://github.com/xianyi/OpenBLAS/issues. ## Contact + * OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users * OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev -## ChangeLog -Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. +## Change log + +Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version. 
## Troubleshooting -* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. -* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. -* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. -* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. -* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). -* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. + +* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first. +* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. +* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. + Clang 3.0 will generate the wrong AVX binary code. +* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), + there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build + the library with `BIGNUMA=1`. +* OpenBLAS does not set processor affinity by default. + On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in + Makefile.rule. However, note that this may cause + [a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). +* On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`). + However, it will be okay when you run the same test case on the shell. ## Contributing -1. 
[Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug. -1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. -1. Write a test which shows that the bug was fixed or that the feature works as expected. -1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. + +1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue + to start a discussion around a feature idea or a bug. +2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. +3. Write a test which shows that the bug was fixed or that the feature works as expected. +4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. ## Donation + Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). From 24f8d5b62413543148ea6f9a44cac875f95a6387 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Fri, 6 Apr 2018 17:30:10 -0700 Subject: [PATCH 22/23] Add DragonFly to exports/Makefile Its exclusion was an oversight on my part. 
--- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index e5e203053..53d4f75bb 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -156,7 +156,7 @@ endif endif #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) so : ../$(LIBSONAME) From 6a0930560e6fc5dc4ce204cf11ca8f9818c7fddc Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Fri, 6 Apr 2018 17:53:58 -0700 Subject: [PATCH 23/23] Add macOS to the Travis testing matrix --- .travis.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 0b280c2fc..e599c75e7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,7 @@ language: c jobs: include: - &test-ubuntu + os: linux stage: test compiler: gcc addons: @@ -57,7 +58,8 @@ jobs: - TARGET_BOX=LINUX32 - BTYPE="BINARY=32" - - stage: test + - os: linux + stage: test compiler: gcc addons: apt: @@ -77,6 +79,7 @@ jobs: # which is slower than container-based infrastructure used for jobs # that don't require sudo. - &test-alpine + os: linux stage: test dist: trusty sudo: true @@ -120,6 +123,7 @@ jobs: - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" - &test-cmake + os: linux stage: test compiler: clang addons: @@ -147,6 +151,17 @@ jobs: env: - CMAKE=1 + - os: osx + stage: test + osx_image: xcode8 + before_script: *common-before + - brew update + - brew install gcc # for gfortran + script: + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + env: + - BTYPE="BINARY=64 INTERFACE64=1" + # whitelist branches: only: