Merge pull request #3014 from RajalakshmiSR/dgemvnp10
POWER10: Optimize dgemv_n
This commit is contained in: commit 5e81e81478
@@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-/**************************************************************************************
-* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
-* BLASTEST : OK
-* CTEST : OK
-* TEST : OK
-* LAPACK-TEST : OK
-**************************************************************************************/
 
 #define HAVE_KERNEL_4x4 1
 
 static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
@@ -266,3 +258,145 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
 );
 }
+
+static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha)
+{
+  double *a0;
+  double *a1;
+  double *a2;
+  double *a3;
+  double *a4;
+  double *a5;
+  double *a6;
+  double *a7;
+  long tmp;
+  __asm__
+  (
+    "lxvp 34, 0( %15) \n\t" // x0, x1
+    "lxvp 38, 32( %15) \n\t" // x4, x5
+
+    XXSPLTD_S(58,%x14,0) // alpha, alpha
+    "sldi %10, %17, 3 \n\t" // lda * sizeof (double)
+    "xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha
+    "xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha
+    "xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha
+    "xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha
+
+    "li %11, 32 \n\t"
+
+    "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
+    "add %10, %10, %10 \n\t" // 2 * lda
+    XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
+    XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
+    XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
+    XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
+    XXSPLTD_S(48,39,1) // x6 * alpha, x6 * alpha
+    XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
+    XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
+    XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
+
+    "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
+    "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
+    "add %7, %5, %10 \n\t" // a4 = a2 + 2 * lda
+    "add %8, %6, %10 \n\t" // a5 = a3 + 2 * lda
+    "add %9, %7, %10 \n\t" // a6 = a4 + 2 * lda
+    "add %10, %8, %10 \n\t" // a7 = a5 + 2 * lda
+
+    "lxvp 40, 0( %3) \n\t" // a0[0], a0[1]
+    "lxvp 42, 0( %4) \n\t" // a1[0], a1[1]
+    "lxvp 44, 0( %5) \n\t" // a2[0], a2[1]
+    "lxvp 46, 0( %6) \n\t" // a3[0], a3[1]
+    "lxvp 50, 0( %7) \n\t" // a4[0]
+    "lxvp 52, 0( %8) \n\t" // a5[0]
+    "lxvp 54, 0( %9) \n\t" // a6[0]
+    "lxvp 56, 0( %10) \n\t" // a7[0]
+
+
+    "addic. %1, %1, -4 \n\t"
+    "ble two%= \n\t"
+
+    ".align 5 \n"
+    "one%=: \n\t"
+
+    "lxvp 36, 0( %2) \n\t" // y0, y1
+
+    "xvmaddadp 36, 40, 34 \n\t"
+    "xvmaddadp 37, 41, 34 \n\t"
+    "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
+    "xvmaddadp 36, 42, 35 \n\t"
+    "xvmaddadp 37, 43, 35 \n\t"
+    "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
+    "xvmaddadp 36, 44, 32 \n\t"
+    "xvmaddadp 37, 45, 32 \n\t"
+    "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
+    "xvmaddadp 36, 46, 33 \n\t"
+    "xvmaddadp 37, 47, 33 \n\t"
+    "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
+    "xvmaddadp 36, 50, 48 \n\t"
+    "xvmaddadp 37, 51, 48 \n\t"
+    "lxvpx 50, %7, %11 \n\t" // a4[0]
+    "xvmaddadp 36, 52, 49 \n\t"
+    "xvmaddadp 37, 53, 49 \n\t"
+    "lxvpx 52, %8, %11 \n\t" // a5[0]
+    "xvmaddadp 36, 54, 38 \n\t"
+    "xvmaddadp 37, 55, 38 \n\t"
+    "lxvpx 54, %9, %11 \n\t" // a6[0]
+    "xvmaddadp 36, 56, 39 \n\t"
+    "xvmaddadp 37, 57, 39 \n\t"
+    "lxvpx 56, %10, %11 \n\t" // a7[0]
+    "addi %11, %11, 32 \n\t"
+
+    "stxvp 36, 0( %2) \n\t" // y0, y1
+    "addi %2, %2, 32 \n\t"
+
+    "addic. %1, %1, -4 \n\t"
+    "bgt one%= \n"
+
+    "two%=: \n\t"
+
+    "lxvp 36, 0( %2) \n\t" // y0, y1
+    "xvmaddadp 36, 40, 34 \n\t"
+    "xvmaddadp 37, 41, 34 \n\t"
+    "xvmaddadp 36, 42, 35 \n\t"
+    "xvmaddadp 37, 43, 35 \n\t"
+    "xvmaddadp 36, 44, 32 \n\t"
+    "xvmaddadp 37, 45, 32 \n\t"
+    "xvmaddadp 36, 46, 33 \n\t"
+    "xvmaddadp 37, 47, 33 \n\t"
+    "xvmaddadp 36, 50, 48 \n\t"
+    "xvmaddadp 37, 51, 48 \n\t"
+    "xvmaddadp 36, 52, 49 \n\t"
+    "xvmaddadp 37, 53, 49 \n\t"
+    "xvmaddadp 36, 54, 38 \n\t"
+    "xvmaddadp 37, 55, 38 \n\t"
+    "xvmaddadp 36, 56, 39 \n\t"
+    "xvmaddadp 37, 57, 39 \n\t"
+    "stxvp 36, 0( %2) \n\t" // y0, y1
+
+    :
+    "+m" (*y),
+    "+r" (n), // 1
+    "+b" (y), // 2
+    "=b" (a0), // 3
+    "=b" (a1), // 4
+    "=&b" (a2), // 5
+    "=&b" (a3), // 6
+    "=&b" (a4), // 7
+    "=&b" (a5), // 8
+    "=&b" (a6), // 9
+    "=&b" (a7), // 10
+    "=b" (tmp)
+    :
+    "m" (*x),
+    "m" (*ap),
+    "d" (alpha), // 14
+    "r" (x), // 15
+    "3" (ap), // 16
+    "4" (lda) // 17
+    :
+    "cr0",
+    "vs32","vs33","vs34","vs35","vs36","vs37",
+    "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48",
+    "vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58"
+  );
+}
+
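For orientation, here is a plain-C sketch of what the new dgemv_kernel_4x8 computes, as read from the asm above (the sketch itself is not part of the commit): x is pre-scaled by alpha, and eight columns of the column-major matrix are folded into n rows of y, four rows per loop iteration.

/* Hypothetical reference version of dgemv_kernel_4x8. The real kernel does
 * this with paired vector loads (lxvp/lxvpx) and xvmaddadp fused
 * multiply-adds, handling 4 rows per iteration (n is a multiple of 4). */
static void dgemv_kernel_4x8_ref (long n, double *ap, long lda,
                                  double *x, double *y, double alpha)
{
  for (long i = 0; i < n; i++)          /* rows of this block          */
    for (long j = 0; j < 8; j++)        /* 8 columns consumed per call */
      y[i] += alpha * x[j] * ap[j * lda + i];
}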

@@ -26,165 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#include <altivec.h>
-
-typedef __vector unsigned char vec_t;
-typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
-typedef __vector_pair __attribute__((aligned(8))) vecp_t;
 
 #include "dgemv_n_microk_power10.c"
 
-#define MMA(X, APTR, ACC) \
-  rX = (vec_t *) & X; \
-  rowA = *((vecp_t*)((void*)&APTR)); \
-  __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);
-
-#define SAVE(ACC, Z) \
-  rowC = (v4sf_t *) &y[Z]; \
-  __builtin_mma_disassemble_acc ((void *)result, ACC); \
-  result[0][1] = result[1][0]; \
-  result[2][1] = result[3][0]; \
-  rowC[0] += valpha * result[0]; \
-  rowC[1] += valpha * result[2];
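An aside on the MMA pattern being removed (my paraphrase, not text from the commit): MMA performs a rank-1 outer-product accumulate, __builtin_mma_xvf64gerpp, of four doubles of a column of A (loaded as a __vector_pair) with two doubles of x into a __vector_quad accumulator; SAVE disassembles the accumulator and folds the partial sums into y, scaled by alpha. A minimal, self-contained usage sketch of those builtins (assuming GCC 10+ with -mcpu=power10; the function name is hypothetical):

/* Standalone illustration of the MMA builtins used by the macros above. */
#include <altivec.h>
typedef __vector unsigned char vec_t;

void mma_ger_sketch (double *a, double *x, double out[4][2])
{
  __vector_quad acc;
  __builtin_mma_xxsetaccz (&acc);                     /* zero the accumulator   */
  __vector_pair rowA = *(__vector_pair *)(void *)a;   /* four doubles of A      */
  vec_t rX = *(vec_t *)(void *)x;                     /* two doubles of x       */
  __builtin_mma_xvf64gerpp (&acc, rowA, rX);          /* acc += a(4x1) * x(1x2) */
  __builtin_mma_disassemble_acc ((void *)out, &acc);  /* spill the 4x2 result   */
}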
-
-void
-dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
-                    FLOAT * y, FLOAT alpha)
-{
-  BLASLONG i, j, tmp;
-  FLOAT *a0 = a_ptr;
-  FLOAT *x1 = xo;
-  vector double valpha = { alpha, alpha };
-  v4sf_t *rowC;
-  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-  v4sf_t result[4];
-  vecp_t rowA;
-  vec_t *rX;
-  tmp = (n / 32) * 32;
-  for (i = 0; i < tmp; i += 32)
-    {
-      xo = x1;
-      a0 = a_ptr;
-      __builtin_mma_xxsetaccz (&acc0);
-      __builtin_mma_xxsetaccz (&acc1);
-      __builtin_mma_xxsetaccz (&acc2);
-      __builtin_mma_xxsetaccz (&acc3);
-      __builtin_mma_xxsetaccz (&acc4);
-      __builtin_mma_xxsetaccz (&acc5);
-      __builtin_mma_xxsetaccz (&acc6);
-      __builtin_mma_xxsetaccz (&acc7);
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      SAVE (&acc0, i + 0);
-      SAVE (&acc1, i + 4);
-      SAVE (&acc2, i + 8);
-      SAVE (&acc3, i + 12);
-      SAVE (&acc4, i + 16);
-      SAVE (&acc5, i + 20);
-      SAVE (&acc6, i + 24);
-      SAVE (&acc7, i + 28);
-
-    }
-  for (i = tmp; i < n; i += 4)
-    {
-      xo = x1;
-      a0 = a_ptr;
-      __builtin_mma_xxsetaccz (&acc0);
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      SAVE (&acc0, i);
-    }
-}
-
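For contrast with the new 4x8 path, here is a scalar sketch of what the deleted dgemv_kernel_4x128 computed (again an illustration, not commit code; FLOAT and BLASLONG are the types from common.h): 128 columns of x per call, processed 32 rows at a time across eight accumulators, performing the same basic update.

/* Hypothetical scalar equivalent of the removed dgemv_kernel_4x128. */
void dgemv_kernel_4x128_ref (BLASLONG n, FLOAT *a_ptr, BLASLONG lda,
                             FLOAT *xo, FLOAT *y, FLOAT alpha)
{
  for (BLASLONG i = 0; i < n; i++)
    for (BLASLONG j = 0; j < 128; j++)
      y[i] += alpha * xo[j] * a_ptr[j * lda + i];
}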
 
 #define NBMAX 4096
 
 #ifndef HAVE_KERNEL_4x4
@@ -281,13 +125,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 FLOAT *a_ptr;
 FLOAT *x_ptr;
 FLOAT *y_ptr;
-BLASLONG n1;
 BLASLONG m1;
 BLASLONG m2;
 BLASLONG m3;
 BLASLONG n2;
 BLASLONG lda4 = lda << 2;
-BLASLONG lda128 = lda << 7;
+BLASLONG lda8 = lda << 3;
 
 FLOAT xbuffer[8] __attribute__ ((aligned (16)));
 FLOAT *ybuffer;
@@ -296,9 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 if ( n < 1 ) return(0);
 
 ybuffer = buffer;
-BLASLONG n128 = n >> 7;
-n1 = (n - (n128 * 128)) >> 2;
-n2 = (n - (n128 * 128)) & 3;
+BLASLONG n8 = n >> 3;
+n2 = n & 3;
 
 m3 = m & 3 ;
 m1 = m & -4 ;
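The new column split is: n8 blocks of 8 columns, one optional block of 4 (taken when n & 4 is set and handled by dgemv_kernel_4x4), and n2 = n & 3 leftover columns. A quick self-check with a hypothetical n:

/* n = 205: 25 blocks of 8 (200 cols) + one 4-col block + 1 leftover. */
#include <assert.h>
int main (void)
{
  long n  = 205;
  long n8 = n >> 3;             /* 25 */
  long n4 = (n & 4) ? 4 : 0;    /*  4 */
  long n2 = n & 3;              /*  1 */
  assert (n8 * 8 + n4 + n2 == n);
  return 0;
}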
@@ -329,14 +171,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 if ( inc_x == 1 )
 {
 
-for( i = 0; i < n128 ; i++)
+for( i = 0; i < n8 ; i++)
 {
-dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
-a_ptr += lda128;
-x_ptr += 128;
+dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
+a_ptr += lda8;
+x_ptr += 8;
 }
 
-for( i = 0; i < n1 ; i++)
+if( n & 4 )
 {
 dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
 a_ptr += lda4;
@@ -363,20 +205,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 }
 else
 {
-for( i = 0; i < n128 ; i++)
+for( i = 0; i < n8 ; i++)
 {
-FLOAT xbuffer[128] __attribute__ ((aligned (16)));
 BLASLONG j;
-for ( j = 0; j < 128 ; j++)
+for ( j = 0; j < 8 ; j++)
 {
 xbuffer[j] = x_ptr[0];
 x_ptr += inc_x;
 }
-dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
-a_ptr += lda128;
+dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
+a_ptr += lda8;
 }
 
-for( i = 0; i < n1 ; i++)
+if( n & 4 )
 {
 xbuffer[0] = x_ptr[0];
 x_ptr += inc_x;
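A final note on the inc_x != 1 path above: x is first staged into the contiguous xbuffer so the kernel can keep using its paired vector loads. The staging loop is equivalent to this sketch (gather_x8 is a hypothetical name; FLOAT and BLASLONG come from common.h):

/* Gather 8 strided elements of x into a contiguous buffer. */
static void gather_x8 (FLOAT *dst, const FLOAT *src, BLASLONG inc_x)
{
  BLASLONG j;
  for (j = 0; j < 8; j++)
  {
    dst[j] = *src;    /* xbuffer[j] = x_ptr[0]; */
    src += inc_x;     /* x_ptr += inc_x;        */
  }
}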