Merge pull request #3014 from RajalakshmiSR/dgemvnp10

POWER10: Optimize dgemv_n
Martin Kroeker 2020-11-30 08:18:24 +01:00 committed by GitHub
commit 5e81e81478
2 changed files with 155 additions and 180 deletions
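
For context: the non-transposed DGEMV path updates y := y + alpha * A * x for a column-major matrix A, and the two files changed here provide the POWER10 kernels for that inner loop. A minimal scalar sketch of the operation being optimized (illustrative only, unit strides assumed, not the OpenBLAS implementation):

/* Illustrative scalar reference for the operation these kernels optimize:
 * y += alpha * A * x, with A stored column-major (leading dimension lda). */
static void dgemv_n_ref(long m, long n, double alpha, const double *a,
                        long lda, const double *x, double *y)
{
    for (long j = 0; j < n; j++) {        /* walk the columns of A      */
        double xa = alpha * x[j];         /* scale each x element once  */
        for (long i = 0; i < m; i++)
            y[i] += xa * a[i + j * lda];  /* fold column j into y       */
    }
}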


@@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
@@ -266,3 +258,145 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
);
}
static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha)
{
double *a0;
double *a1;
double *a2;
double *a3;
double *a4;
double *a5;
double *a6;
double *a7;
long tmp;
__asm__
(
"lxvp 34, 0( %15) \n\t" // x0, x1
"lxvp 38, 32( %15) \n\t" // x4, x5
XXSPLTD_S(58,%x14,0) // alpha, alpha
"sldi %10, %17, 3 \n\t" // lda * sizeof (double)
"xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha
"xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha
"xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha
"li %11, 32 \n\t"
"add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
"add %10, %10, %10 \n\t" // 2 * lda
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
XXSPLTD_S(48,39,1) // x6 * alpha, x6 * alpha
XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
"add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
"add %7, %5, %10 \n\t" // a4 = a2 + 2 * lda
"add %8, %6, %10 \n\t" // a5 = a3 + 2 * lda
"add %9, %7, %10 \n\t" // a6 = a4 + 2 * lda
"add %10, %8, %10 \n\t" // a7 = a5 + 2 * lda
"lxvp 40, 0( %3) \n\t" // a0[0], a0[1]
"lxvp 42, 0( %4) \n\t" // a1[0], a1[1]
"lxvp 44, 0( %5) \n\t" // a2[0], a2[1]
"lxvp 46, 0( %6) \n\t" // a3[0], a3[1]
"lxvp 50, 0( %7) \n\t" // a4[0]
"lxvp 52, 0( %8) \n\t" // a5[0]
"lxvp 54, 0( %9) \n\t" // a6[0]
"lxvp 56, 0( %10) \n\t" // a7[0]
"addic. %1, %1, -4 \n\t"
"ble two%= \n\t"
".align 5 \n"
"one%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
"lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
"xvmaddadp 36, 42, 35 \n\t"
"xvmaddadp 37, 43, 35 \n\t"
"lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
"xvmaddadp 36, 44, 32 \n\t"
"xvmaddadp 37, 45, 32 \n\t"
"lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
"xvmaddadp 36, 46, 33 \n\t"
"xvmaddadp 37, 47, 33 \n\t"
"lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
"xvmaddadp 36, 50, 48 \n\t"
"xvmaddadp 37, 51, 48 \n\t"
"lxvpx 50, %7, %11 \n\t" // a4[0]
"xvmaddadp 36, 52, 49 \n\t"
"xvmaddadp 37, 53, 49 \n\t"
"lxvpx 52, %8, %11 \n\t" // a5[0]
"xvmaddadp 36, 54, 38 \n\t"
"xvmaddadp 37, 55, 38 \n\t"
"lxvpx 54, %9, %11 \n\t" // a6[0]
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
"lxvpx 56, %10, %11 \n\t" // a7[0]
"addi %11, %11, 32 \n\t"
"stxvp 36, 0( %2) \n\t" // y0, y1
"addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t"
"bgt one%= \n"
"two%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
"xvmaddadp 36, 42, 35 \n\t"
"xvmaddadp 37, 43, 35 \n\t"
"xvmaddadp 36, 44, 32 \n\t"
"xvmaddadp 37, 45, 32 \n\t"
"xvmaddadp 36, 46, 33 \n\t"
"xvmaddadp 37, 47, 33 \n\t"
"xvmaddadp 36, 50, 48 \n\t"
"xvmaddadp 37, 51, 48 \n\t"
"xvmaddadp 36, 52, 49 \n\t"
"xvmaddadp 37, 53, 49 \n\t"
"xvmaddadp 36, 54, 38 \n\t"
"xvmaddadp 37, 55, 38 \n\t"
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
"stxvp 36, 0( %2) \n\t" // y0, y1
:
"+m" (*y),
"+r" (n), // 1
"+b" (y), // 2
"=b" (a0), // 3
"=b" (a1), // 4
"=&b" (a2), // 5
"=&b" (a3), // 6
"=&b" (a4), // 7
"=&b" (a5), // 8
"=&b" (a6), // 9
"=&b" (a7), // 10
"=b" (tmp)
:
"m" (*x),
"m" (*ap),
"d" (alpha), // 14
"r" (x), // 15
"3" (ap), // 16
"4" (lda) // 17
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48",
"vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58"
);
}
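
The new dgemv_kernel_4x8 above pre-scales eight x values by alpha, then folds eight columns of A into y, four rows (two vector pairs) per loop iteration. A rough C equivalent of what the assembly computes, assuming unit strides and n a multiple of 4 (a sketch, not a drop-in replacement):

/* Rough C equivalent of the dgemv_kernel_4x8 assembly above (sketch only). */
static void dgemv_kernel_4x8_ref(long n, double *ap, long lda,
                                 double *x, double *y, double alpha)
{
    double xa[8];
    for (int k = 0; k < 8; k++)
        xa[k] = alpha * x[k];              /* pre-scale x, as the asm does     */
    for (long i = 0; i < n; i++) {         /* the asm covers 4 rows per pass   */
        double s = 0.0;
        for (int k = 0; k < 8; k++)
            s += ap[i + k * lda] * xa[k];  /* 8 columns contribute to each row */
        y[i] += s;
    }
}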


@@ -26,165 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
typedef __vector_pair __attribute__((aligned(8))) vecp_t;
#include "dgemv_n_microk_power10.c"
#define MMA(X, APTR, ACC) \
rX = (vec_t *) & X; \
rowA = *((vecp_t*)((void*)&APTR)); \
__builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);
#define SAVE(ACC, Z) \
rowC = (v4sf_t *) &y[Z]; \
__builtin_mma_disassemble_acc ((void *)result, ACC); \
result[0][1] = result[1][0]; \
result[2][1] = result[3][0]; \
rowC[0] += valpha * result[0]; \
rowC[1] += valpha * result[2];
void
dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
FLOAT * y, FLOAT alpha)
{
BLASLONG i, j, tmp;
FLOAT *a0 = a_ptr;
FLOAT *x1 = xo;
vector double valpha = { alpha, alpha };
v4sf_t *rowC;
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
v4sf_t result[4];
vecp_t rowA;
vec_t *rX;
tmp = (n / 32) * 32;
for (i = 0; i < tmp; i += 32)
{
xo = x1;
a0 = a_ptr;
__builtin_mma_xxsetaccz (&acc0);
__builtin_mma_xxsetaccz (&acc1);
__builtin_mma_xxsetaccz (&acc2);
__builtin_mma_xxsetaccz (&acc3);
__builtin_mma_xxsetaccz (&acc4);
__builtin_mma_xxsetaccz (&acc5);
__builtin_mma_xxsetaccz (&acc6);
__builtin_mma_xxsetaccz (&acc7);
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
}
xo += 32;
a0 += lda << 5;
SAVE (&acc0, i + 0);
SAVE (&acc1, i + 4);
SAVE (&acc2, i + 8);
SAVE (&acc3, i + 12);
SAVE (&acc4, i + 16);
SAVE (&acc5, i + 20);
SAVE (&acc6, i + 24);
SAVE (&acc7, i + 28);
}
for (i = tmp; i < n; i += 4)
{
xo = x1;
a0 = a_ptr;
__builtin_mma_xxsetaccz (&acc0);
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + j * lda], &acc0);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + j * lda], &acc0);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + j * lda], &acc0);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + j * lda], &acc0);
}
xo += 32;
a0 += lda << 5;
SAVE (&acc0, i);
}
}
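
For comparison, the dgemv_kernel_4x128 removed here used MMA accumulators to fold 128 consecutive columns of A into y per call, in four passes of 32 columns, applying alpha when the accumulators were written back. An illustrative scalar view of that kernel's contract (sketch only, unit strides, n a multiple of 4):

/* Scalar view of what the removed 4x128 MMA kernel contributed per call. */
static void dgemv_kernel_4x128_ref(BLASLONG n, FLOAT *ap, BLASLONG lda,
                                   FLOAT *x, FLOAT *y, FLOAT alpha)
{
    for (BLASLONG i = 0; i < n; i++) {
        FLOAT s = 0.0;
        for (int k = 0; k < 128; k++)       /* four blocks of 32 in the MMA code */
            s += ap[i + k * lda] * x[k];
        y[i] += alpha * s;                  /* alpha applied at save time        */
    }
}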
#define NBMAX 4096
#ifndef HAVE_KERNEL_4x4
@@ -281,13 +125,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
BLASLONG lda128 = lda << 7;
BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8] __attribute__ ((aligned (16)));
FLOAT *ybuffer;
@@ -296,9 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
if ( n < 1 ) return(0);
ybuffer = buffer;
BLASLONG n128 = n >> 7;
n1 = (n - (n128 * 128)) >> 2;
n2 = (n - (n128 * 128)) & 3;
BLASLONG n8 = n >> 3;
n2 = n & 3;
m3 = m & 3 ;
m1 = m & -4 ;
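
With the switch from a 128-column to an 8-column kernel, the driver now splits the n columns into n8 blocks of 8, at most one block of 4 (when n & 4 is set), and n2 = n & 3 scalar leftovers; for example n = 29 gives n8 = 3, one block of 4, and n2 = 1 remaining column. Condensed from the hunks below, the unit-stride x branch effectively does the following (loop bodies abbreviated):

/* Condensed sketch of the new column dispatch (unit-stride x), per the diff. */
for (i = 0; i < n8; i++) {
    dgemv_kernel_4x8(NB, a_ptr, lda, x_ptr, ybuffer, alpha);
    a_ptr += lda8;
    x_ptr += 8;
}
if (n & 4) {                               /* at most one block of 4 columns */
    dgemv_kernel_4x4(NB, a_ptr, lda, x_ptr, ybuffer, alpha);
    a_ptr += lda4;
    x_ptr += 4;                            /* advance past the block (assumed) */
}
/* ...the n2 leftover columns are then handled one at a time... */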
@@ -329,14 +171,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
if ( inc_x == 1 )
{
for( i = 0; i < n128 ; i++)
for( i = 0; i < n8 ; i++)
{
dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
a_ptr += lda128;
x_ptr += 128;
dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
a_ptr += lda8;
x_ptr += 8;
}
for( i = 0; i < n1 ; i++)
if( n & 4 )
{
dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
a_ptr += lda4;
@@ -363,20 +205,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
}
else
{
for( i = 0; i < n128 ; i++)
for( i = 0; i < n8 ; i++)
{
FLOAT xbuffer[128] __attribute__ ((aligned (16)));
BLASLONG j;
for ( j = 0; j < 128 ; j++)
for ( j = 0; j < 8 ; j++)
{
xbuffer[j] = x_ptr[0];
x_ptr += inc_x;
}
dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
a_ptr += lda128;
dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
a_ptr += lda8;
}
for( i = 0; i < n1 ; i++)
if( n & 4 )
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;