POWER10: Improve axpy performance
This patch aligns the stores to a 32-byte boundary for saxpy and daxpy before entering the vector-pair loop. For caxpy, the store instructions are changed to stxv to improve performance in unaligned cases.
This commit is contained in:
parent
83de62c20d
commit
346e30a46a
|
@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
|
||||||
"xvmaddasp 38, 58, 33 \n\t"
|
"xvmaddasp 38, 58, 33 \n\t"
|
||||||
"xvmaddasp 39, 59, 33 \n\t"
|
"xvmaddasp 39, 59, 33 \n\t"
|
||||||
|
|
||||||
"stxvp 48, 0(%4) \n\t"
|
"stxv 49, 0(%4) \n\t"
|
||||||
"stxvp 50, 32(%4) \n\t"
|
"stxv 48, 16(%4) \n\t"
|
||||||
"stxvp 34, 64(%4) \n\t"
|
"stxv 51, 32(%4) \n\t"
|
||||||
"stxvp 38, 96(%4) \n\t"
|
"stxv 50, 48(%4) \n\t"
|
||||||
|
"stxv 35, 64(%4) \n\t"
|
||||||
|
"stxv 34, 80(%4) \n\t"
|
||||||
|
"stxv 39, 96(%4) \n\t"
|
||||||
|
"stxv 38, 112(%4) \n\t"
|
||||||
|
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
|
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
|
||||||
|
@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
|
||||||
"xvmaddasp 38, 58, 33 \n\t"
|
"xvmaddasp 38, 58, 33 \n\t"
|
||||||
"xvmaddasp 39, 59, 33 \n\t"
|
"xvmaddasp 39, 59, 33 \n\t"
|
||||||
|
|
||||||
"stxvp 48, 0(%4) \n\t"
|
"stxv 49, 0(%4) \n\t"
|
||||||
"stxvp 50, 32(%4) \n\t"
|
"stxv 48, 16(%4) \n\t"
|
||||||
"stxvp 34, 64(%4) \n\t"
|
"stxv 51, 32(%4) \n\t"
|
||||||
"stxvp 38, 96(%4) \n\t"
|
"stxv 50, 48(%4) \n\t"
|
||||||
|
"stxv 35, 64(%4) \n\t"
|
||||||
|
"stxv 34, 80(%4) \n\t"
|
||||||
|
"stxv 39, 96(%4) \n\t"
|
||||||
|
"stxv 38, 112(%4) \n\t"
|
||||||
|
|
||||||
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
|
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
|
||||||
:
|
:
|
||||||
|
|
|
@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ( (inc_x == 1) && (inc_y == 1) )
|
||||||
{
|
{
|
||||||
|
|
||||||
BLASLONG n1 = n & -16;
|
if ( n >= 16 )
|
||||||
|
{
|
||||||
|
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||||
|
for (i = 0; i < align; i++) {
|
||||||
|
y[i] += da * x[i] ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
BLASLONG n1 = (n-i) & -16;
|
||||||
|
if ( n1 )
|
||||||
|
daxpy_kernel_8(n1, &x[i], &y[i], da);
|
||||||
|
|
||||||
if ( n1 )
|
i += n1;
|
||||||
daxpy_kernel_8(n1, x, y, da);
|
|
||||||
|
|
||||||
i = n1;
|
|
||||||
while(i < n)
|
while(i < n)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
|
@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ( (inc_x == 1) && (inc_y == 1) )
|
||||||
{
|
{
|
||||||
|
|
||||||
BLASLONG n1 = n & -64;
|
if ( n >= 64 )
|
||||||
|
{
|
||||||
|
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
|
||||||
|
for (i = 0; i < align; i++) {
|
||||||
|
y[i] += da * x[i] ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
BLASLONG n1 = (n-i) & -64;
|
||||||
if ( n1 )
|
if ( n1 )
|
||||||
saxpy_kernel_64(n1, x, y, da);
|
saxpy_kernel_64(n1, &x[i], &y[i], da);
|
||||||
|
|
||||||
i = n1;
|
i += n1;
|
||||||
while(i < n)
|
while(i < n)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue