Merge pull request #3029 from RajalakshmiSR/axpyp10
POWER10: Improve axpy performance
This commit is contained in:
commit
043128cbe5
|
@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
|
|||
"xvmaddasp 38, 58, 33 \n\t"
|
||||
"xvmaddasp 39, 59, 33 \n\t"
|
||||
|
||||
"stxvp 48, 0(%4) \n\t"
|
||||
"stxvp 50, 32(%4) \n\t"
|
||||
"stxvp 34, 64(%4) \n\t"
|
||||
"stxvp 38, 96(%4) \n\t"
|
||||
"stxv 49, 0(%4) \n\t"
|
||||
"stxv 48, 16(%4) \n\t"
|
||||
"stxv 51, 32(%4) \n\t"
|
||||
"stxv 50, 48(%4) \n\t"
|
||||
"stxv 35, 64(%4) \n\t"
|
||||
"stxv 34, 80(%4) \n\t"
|
||||
"stxv 39, 96(%4) \n\t"
|
||||
"stxv 38, 112(%4) \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
|
||||
|
@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
|
|||
"xvmaddasp 38, 58, 33 \n\t"
|
||||
"xvmaddasp 39, 59, 33 \n\t"
|
||||
|
||||
"stxvp 48, 0(%4) \n\t"
|
||||
"stxvp 50, 32(%4) \n\t"
|
||||
"stxvp 34, 64(%4) \n\t"
|
||||
"stxvp 38, 96(%4) \n\t"
|
||||
"stxv 49, 0(%4) \n\t"
|
||||
"stxv 48, 16(%4) \n\t"
|
||||
"stxv 51, 32(%4) \n\t"
|
||||
"stxv 50, 48(%4) \n\t"
|
||||
"stxv 35, 64(%4) \n\t"
|
||||
"stxv 34, 80(%4) \n\t"
|
||||
"stxv 39, 96(%4) \n\t"
|
||||
"stxv 38, 112(%4) \n\t"
|
||||
|
||||
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
|
||||
:
|
||||
|
|
|
@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n >= 16 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
y[i] += da * x[i] ;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-i) & -16;
|
||||
if ( n1 )
|
||||
daxpy_kernel_8(n1, &x[i], &y[i], da);
|
||||
|
||||
if ( n1 )
|
||||
daxpy_kernel_8(n1, x, y, da);
|
||||
i += n1;
|
||||
|
||||
i = n1;
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
|
|
|
@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -64;
|
||||
|
||||
if ( n >= 64 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
|
||||
for (i = 0; i < align; i++) {
|
||||
y[i] += da * x[i] ;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-i) & -64;
|
||||
if ( n1 )
|
||||
saxpy_kernel_64(n1, x, y, da);
|
||||
saxpy_kernel_64(n1, &x[i], &y[i], da);
|
||||
|
||||
i = n1;
|
||||
i += n1;
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
|
|
Loading…
Reference in New Issue