Merge pull request #3029 from RajalakshmiSR/axpyp10

POWER10: Improve axpy performance
This commit is contained in:
Martin Kroeker 2020-12-10 22:49:28 +01:00 committed by GitHub
commit 043128cbe5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 37 additions and 16 deletions

View File

@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"
"stxvp 48, 0(%4) \n\t"
"stxvp 50, 32(%4) \n\t"
"stxvp 34, 64(%4) \n\t"
"stxvp 38, 96(%4) \n\t"
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
"stxv 50, 48(%4) \n\t"
"stxv 35, 64(%4) \n\t"
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"
"addi %4, %4, 128 \n\t"
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"
"stxvp 48, 0(%4) \n\t"
"stxvp 50, 32(%4) \n\t"
"stxvp 34, 64(%4) \n\t"
"stxvp 38, 96(%4) \n\t"
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
"stxv 50, 48(%4) \n\t"
"stxv 35, 64(%4) \n\t"
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
:

View File

@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( (inc_x == 1) && (inc_y == 1) )
{
BLASLONG n1 = n & -16;
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
for (i = 0; i < align; i++) {
y[i] += da * x[i] ;
}
}
BLASLONG n1 = (n-i) & -16;
if ( n1 )
daxpy_kernel_8(n1, &x[i], &y[i], da);
if ( n1 )
daxpy_kernel_8(n1, x, y, da);
i += n1;
i = n1;
while(i < n)
{

View File

@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( (inc_x == 1) && (inc_y == 1) )
{
BLASLONG n1 = n & -64;
if ( n >= 64 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
for (i = 0; i < align; i++) {
y[i] += da * x[i] ;
}
}
BLASLONG n1 = (n-i) & -64;
if ( n1 )
saxpy_kernel_64(n1, x, y, da);
saxpy_kernel_64(n1, &x[i], &y[i], da);
i = n1;
i += n1;
while(i < n)
{